Name | Executed | Routines | % | Executed | Lines | % | Unexecuted |
/home/matt/eu/rds/include/std/regex.e | 19 | 19 | 100.00% | 172 | 178 | 96.63% | 6 |
Routine | Executed | Lines | Unexecuted | |
option_spec() | 7 | 10 | 70.00% | 3 |
find() | 4 | 5 | 80.00% | 1 |
get_ovector_size() | 4 | 5 | 80.00% | 1 |
matches() | 14 | 15 | 93.33% | 1 |
all_matches() | 14 | 14 | 100.00% | 0 |
error_message() | 2 | 2 | 100.00% | 0 |
error_to_string() | 4 | 4 | 100.00% | 0 |
escape() | 2 | 2 | 100.00% | 0 |
find_all() | 12 | 12 | 100.00% | 0 |
find_replace() | 2 | 2 | 100.00% | 0 |
find_replace_callback() | 17 | 17 | 100.00% | 0 |
find_replace_limit() | 3 | 3 | 100.00% | 0 |
has_match() | 2 | 2 | 100.00% | 0 |
is_match() | 5 | 5 | 100.00% | 0 |
new() | 3 | 3 | 100.00% | 0 |
option_spec_to_string() | 2 | 2 | 100.00% | 0 |
regex() | 2 | 2 | 100.00% | 0 |
split() | 2 | 2 | 100.00% | 0 |
split_limit() | 14 | 14 | 100.00% | 0 |
# | Executed | |
1 | -- (c) Copyright - See License.txt | |
2 | namespace regex | |
3 | ||
4 | include std/math.e | |
5 | include std/text.e | |
6 | include std/types.e | |
7 | include std/flags.e as flags | |
8 | include std/error.e | |
9 | include std/search.e | |
10 | ||
11 | --**** | |
12 | -- == Regular Expressions | |
13 | -- | |
14 | -- < | |
15 | -- | |
16 | -- === Introduction | |
17 | -- | |
18 | -- Regular expressions in Euphoria are based on the PCRE (Perl Compatible Regular Expressions) | |
19 | -- library created by Philip Hazel. | |
20 | -- | |
21 | -- This document will detail the Euphoria interface to Regular Expressions, not really | |
22 | -- regular expression syntax. It is a very complex subject that many books have been | |
23 | -- written on. Here are a few good resources online that can help while learning | |
24 | -- regular expressions. | |
25 | -- | |
26 | -- * [[EUForum Article -> http://openeuphoria.org/wiki/euwiki.cgi?EuGuide%20Regular%20Expressions ]] | |
27 | -- * [[Perl Regular Expressions Man Page -> http://perldoc.perl.org/perlre.html]] | |
28 | -- * [[Regular Expression Library -> http://regexlib.com/]] (user supplied regular | |
29 | -- expressions for just about any task). | |
30 | -- * [[WikiPedia Regular Expression Article -> http://en.wikipedia.org/wiki/Regular_expression]] | |
31 | -- * [[Man page of PCRE in HTML -> http://www.slabihoud.de/software/archives/pcrecompat.html]] | |
32 | -- === General Use | |
33 | -- | |
34 | -- Many functions take an optional ##options## parameter. This parameter can be either | |
35 | -- a single option constant (see [[:Option Constants]]), multiple option constants or'ed | |
36 | -- together into a single atom or a sequence of options, in which the function will take | |
37 | -- care of ensuring they are or'ed together correctly. Options are like their C equivalents | |
38 | -- with the 'PCRE_' prefix stripped off. Name spaces disambiguate symbols so we don't | |
39 | -- need this prefix. | |
40 | -- | |
41 | -- All strings passed into this library must be either 8-bit per character strings or | |
42 | -- UTF which uses multiple bytes to encode UNICODE characters. You can | |
43 | -- use UTF8 encoded UNICODE strings when you pass the UTF8 option. | |
44 | ||
45 | 6 | enum M_PCRE_COMPILE=68, M_PCRE_FREE, M_PCRE_EXEC, M_PCRE_REPLACE, M_PCRE_ERROR_MESSAGE=95, M_PCRE_GET_OVECTOR_SIZE=97 |
46 | ||
47 | --**** | |
48 | -- === Option Constants | |
49 | -- | |
50 | -- ==== Compile Time and Match Time | |
51 | -- | |
52 | -- When a regular expression object is created via ##new## we can also say it gets "compiled." | |
53 | -- The options you may use for this are called "compile time" option constants. Once | |
54 | -- the regular expression is created you can use the other functions that take this regular | |
55 | -- expression and a string. These routines' options are called "match time" option constants. | |
56 | -- To not set any options at all, do not supply the options argument or supply [[:DEFAULT]]. | |
57 | -- | |
58 | -- ===== Compile Time Option Constants | |
59 | -- | |
60 | -- The only options that may be set at "compile time"; that is, to pass to ##new##; | |
61 | -- are [[:ANCHORED]], [[:AUTO_CALLOUT]], [[:BSR_ANYCRLF]], [[:BSR_UNICODE]], [[:CASELESS]], | |
62 | -- [[:DEFAULT]], [[:DOLLAR_ENDONLY]], [[:DOTALL]], [[:DUPNAMES]], [[:EXTENDED]], [[:EXTRA]], | |
63 | -- [[:FIRSTLINE]], [[:MULTILINE]], [[:NEWLINE_CR]], [[:NEWLINE_LF]], [[:NEWLINE_CRLF]], | |
64 | -- [[:NEWLINE_ANY]], [[:NEWLINE_ANYCRLF]], [[:NO_AUTO_CAPTURE]], [[:NO_UTF8_CHECK]], | |
65 | -- [[:UNGREEDY]], and [[:UTF8]]. | |
66 | -- | |
67 | -- | |
68 | -- ===== Match Time Option Constants | |
69 | -- | |
70 | -- Options that may be set at "match time" are [[:ANCHORED]], [[:NEWLINE_CR]], [[:NEWLINE_LF]], | |
71 | -- [[:NEWLINE_CRLF]], [[:NEWLINE_ANY]], [[:NEWLINE_ANYCRLF]], [[:NOTBOL]], [[:NOTEOL]], | |
72 | -- [[:NOTEMPTY]], [[:NO_UTF8_CHECK]]. Routines that take match time option constants match, | |
73 | -- split or replace a regular expression against some string. | |
74 | -- | |
75 | -- | |
76 | ||
77 | --**** | |
78 | -- Signature: | |
79 | -- public constant ANCHORED | |
80 | -- | |
81 | -- Description: | |
82 | -- Forces matches to be only from the first place it is asked to | |
83 | -- try to make a search. | |
84 | -- In C, this is called PCRE_ANCHORED. | |
85 | -- This is passed to all routines including [[:new]]. | |
86 | ||
87 | --**** | |
88 | -- Signature: | |
89 | -- public constant AUTO_CALLOUT | |
90 | -- | |
91 | -- Description: | |
92 | -- In C, this is called PCRE_AUTO_CALLOUT. | |
93 | -- To get the functionality of this flag in EUPHORIA, you can use: | |
94 | -- [[:find_replace_callback]] without passing this option. | |
95 | -- This is passed to [[:new]]. | |
96 | ||
97 | --**** | |
98 | -- Signature: | |
99 | -- public constant BSR_ANYCRLF | |
100 | -- | |
101 | -- Description: | |
102 | -- With this option only ASCII new line sequences are recognized as newlines. Other UNICODE | |
103 | -- newline sequences (encoded as UTF8) are not recognized as an end of line marker. | |
104 | -- This is passed to all routines including [[:new]]. | |
105 | ||
106 | --**** | |
107 | -- Signature: | |
108 | -- public constant BSR_UNICODE | |
109 | -- | |
110 | -- Description: | |
111 | -- With this option any UNICODE new line sequence is recognized as a newline. | |
112 | -- The UNICODE will have to be encoded as UTF8, however. | |
113 | -- This is passed to all routines including [[:new]]. | |
114 | ||
115 | --**** | |
116 | -- Signature: | |
117 | -- public constant CASELESS | |
118 | -- | |
119 | -- Description: | |
120 | -- This will make your regular expression matches case insensitive. With this | |
121 | -- flag for example, [a-z] is the same as [A-Za-z]. | |
122 | -- This is passed to [[:new]]. | |
123 | ||
124 | --**** | |
125 | -- Signature: | |
126 | -- public constant DEFAULT | |
127 | -- | |
128 | -- Description: | |
129 | -- This is a value used for not setting any flags at all. This can be passed to | |
130 | -- all routines including [[:new]] | |
131 | ||
132 | --**** | |
133 | -- Signature: | |
134 | -- public constant DFA_SHORTEST | |
135 | -- | |
136 | -- Description: | |
137 | -- This is NOT used by any standard library routine. | |
138 | ||
139 | --**** | |
140 | -- Signature: | |
141 | -- public constant DFA_RESTART | |
142 | -- | |
143 | -- Description: | |
144 | -- This is NOT used by any standard library routine. | |
145 | ||
146 | --**** | |
147 | -- Signature: | |
148 | -- public constant DOLLAR_ENDONLY | |
149 | -- | |
150 | -- Description: | |
151 | -- If this bit is set, a dollar sign metacharacter in the pattern matches only | |
152 | -- at the end of the subject string. Without this option, a dollar sign also | |
153 | -- matches immediately before a newline at the end of the string (but not | |
154 | -- before any other newlines). Thus you must include the newline character | |
155 | -- in the pattern before the dollar sign if you want to match a line that contains | |
156 | -- a newline character. | |
157 | -- The DOLLAR_ENDONLY option is ignored if MULTILINE is set. | |
158 | -- There is no way to set this option within a pattern. | |
159 | -- This is passed to [[:new]]. | |
160 | ||
161 | --**** | |
162 | -- Signature: | |
163 | -- public constant DOTALL | |
164 | -- | |
165 | -- Description: | |
166 | -- With this option the '.' character also matches a newline sequence. | |
167 | -- This is passed to [[:new]]. | |
168 | ||
169 | --**** | |
170 | -- Signature: | |
171 | -- public constant DUPNAMES | |
172 | -- | |
173 | -- Description: | |
174 | -- Allow duplicate names for named subpatterns. | |
175 | -- Since there is no way to access named subpatterns this flag has no effect. | |
176 | -- This is passed to [[:new]]. | |
177 | ||
178 | --**** | |
179 | -- Signature: | |
180 | -- public constant EXTENDED | |
181 | -- | |
182 | -- Description: | |
183 | -- Whitespace and characters beginning with a hash mark to the end of the line | |
184 | -- in the pattern will be ignored when searching except when the whitespace or hash | |
185 | -- is escaped or in a character class. | |
186 | -- This is passed to [[:new]]. | |
187 | ||
188 | --**** | |
189 | -- Signature: | |
190 | -- public constant EXTRA | |
191 | -- | |
192 | -- Description: | |
193 | -- When an alphanumeric that follows a backslash (\) has no special meaning, an | |
194 | -- error is generated. | |
195 | -- This is passed to [[:new]]. | |
196 | ||
197 | --**** | |
198 | -- Signature: | |
199 | -- public constant FIRSTLINE | |
200 | -- | |
201 | -- Description: | |
202 | -- If PCRE_FIRSTLINE is set, the match must happen before or at the first | |
203 | -- newline in the subject (though it may continue over the newline). | |
204 | -- This is passed to [[:new]]. | |
205 | ||
206 | --**** | |
207 | -- Signature: | |
208 | -- public constant MULTILINE | |
209 | -- | |
210 | -- Description: | |
211 | -- When MULTILINE is set, the "start of line" and "end of line" | |
212 | -- constructs match immediately following or immediately before internal | |
213 | -- newlines in the subject string, respectively, as well as at the very | |
214 | -- start and end. This is passed to [[:new]]. | |
215 | ||
216 | --**** | |
217 | -- Signature: | |
218 | -- public constant NEWLINE_CR | |
219 | -- | |
220 | -- Description: | |
221 | -- Sets CR as the NEWLINE sequence. | |
222 | -- The NEWLINE sequence will match $ | |
223 | -- when MULTILINE is set. | |
224 | -- This is passed to all routines including [[:new]]. | |
225 | ||
226 | --**** | |
227 | -- Signature: | |
228 | -- public constant NEWLINE_LF | |
229 | -- | |
230 | -- Description: | |
231 | -- Sets LF as the NEWLINE sequence. | |
232 | -- The NEWLINE sequence will match $ | |
233 | -- when MULTILINE is set. | |
234 | -- This is passed to all routines including [[:new]]. | |
235 | ||
236 | --**** | |
237 | -- Signature: | |
238 | -- public constant NEWLINE_CRLF | |
239 | -- | |
240 | -- Description: | |
241 | -- Sets CRLF as the NEWLINE sequence. | |
242 | -- The NEWLINE sequence will match $ | |
243 | -- when MULTILINE is set. | |
244 | -- This is passed to all routines including [[:new]]. | |
245 | ||
246 | --**** | |
247 | -- Signature: | |
248 | -- public constant NEWLINE_ANY | |
249 | -- | |
250 | -- Description: | |
251 | -- Sets ANY newline sequence as the NEWLINE sequence including | |
252 | -- those from UNICODE when UTF8 is also set. The string will have | |
253 | -- to be encoded as UTF8, however. | |
254 | -- The NEWLINE sequence will match $ | |
255 | -- when MULTILINE is set. | |
256 | -- This is passed to all routines including [[:new]]. | |
257 | ||
258 | --**** | |
259 | -- Signature: | |
260 | -- public constant NEWLINE_ANYCRLF | |
261 | -- | |
262 | -- Description: | |
263 | -- Sets ANY newline sequence from ASCII. | |
264 | -- The NEWLINE sequence will match $ | |
265 | -- when MULTILINE is set. | |
266 | -- This is passed to all routines including [[:new]]. | |
267 | ||
268 | --**** | |
269 | -- Signature: | |
270 | -- public constant NOTBOL | |
271 | -- | |
272 | -- Description: | |
273 | -- This indicates that the beginning of the passed string does **NOT** start | |
274 | -- at the **B**eginning **O**f a **L**ine (NOTBOL), so a caret symbol (^) in the | |
275 | -- original pattern will not match the beginning of the string. | |
276 | -- This is used by routines other than [[:new]]. | |
277 | ||
278 | --**** | |
279 | -- Signature: | |
280 | -- public constant NOTEOL | |
281 | -- | |
282 | -- Description: | |
283 | -- This indicates that the end of the passed string does **NOT** end | |
284 | -- at the **E**nd **O**f a **L**ine (NOTEOL), so a dollar sign ($) in the | |
285 | -- original pattern will not match the end of the string. | |
286 | -- This is used by routines other than [[:new]]. | |
287 | ||
288 | --**** | |
289 | -- Signature: | |
290 | -- public constant NO_AUTO_CAPTURE | |
291 | -- | |
292 | -- Description: | |
293 | -- Disables capturing subpatterns except when the subpatterns are | |
294 | -- named. | |
295 | -- This is passed to [[:new]]. | |
296 | ||
297 | --**** | |
298 | -- Signature: | |
299 | -- public constant NO_UTF8_CHECK | |
300 | -- | |
301 | -- Description: | |
302 | -- Turn off checking for the validity of your UTF string. Use this | |
303 | -- with caution. An invalid utf8 string with this option could **crash** | |
304 | -- your program. Only use this if you know the string is a valid utf8 string. | |
305 | -- See [[:unicode:validate]]. | |
306 | -- This is passed to all routines including [[:new]]. | |
307 | ||
308 | --**** | |
309 | -- Signature: | |
310 | -- public constant NOTEMPTY | |
311 | -- | |
312 | -- Description: | |
313 | -- Here matches of empty strings will not be allowed. In C, this is PCRE_NOTEMPTY. | |
314 | -- The pattern: `A*a*` will match "AAAA", "aaaa", and "Aaaa" but not "". | |
315 | -- This is used by routines other than [[:new]]. | |
316 | ||
317 | --**** | |
318 | -- Signature: | |
319 | -- public constant PARTIAL | |
320 | -- | |
321 | -- Description: | |
322 | -- This option has no effect with these routines. Refer to the C documentation | |
323 | -- for what it does in C. | |
324 | -- In C, this constant is called PCRE_PARTIAL. | |
325 | -- This is used by routines other than [[:new]]. | |
326 | ||
327 | --**** | |
328 | -- Signature: | |
329 | -- public constant STRING_OFFSETS | |
330 | -- | |
331 | -- Description: | |
332 | -- This is used by [[:matches]] and [[:all_matches]]. | |
333 | ||
334 | --**** | |
335 | -- Signature: | |
336 | -- public constant UNGREEDY | |
337 | -- | |
338 | -- Description: | |
339 | -- This modifier sets the pattern such that quantifiers are | |
340 | -- not greedy by default, but become greedy if followed by a question mark. | |
341 | -- This is passed to [[:new]]. | |
342 | ||
343 | --**** | |
344 | -- Signature: | |
345 | -- public constant UTF8 | |
346 | -- | |
347 | -- Description: | |
348 | -- Makes strings passed in to be interpreted as a UTF8 encoded string. | |
349 | -- This is passed to [[:new]]. | |
350 | ||
351 | public constant | |
352 | 6 | DEFAULT = #00000000, |
353 | 6 | CASELESS = #00000001, |
354 | 6 | MULTILINE = #00000002, |
355 | 6 | DOTALL = #00000004, |
356 | 6 | EXTENDED = #00000008, |
357 | 6 | ANCHORED = #00000010, |
358 | 6 | DOLLAR_ENDONLY = #00000020, |
359 | 6 | EXTRA = #00000040, |
360 | 6 | NOTBOL = #00000080, |
361 | 6 | NOTEOL = #00000100, |
362 | 6 | UNGREEDY = #00000200, |
363 | 6 | NOTEMPTY = #00000400, |
364 | 6 | UTF8 = #00000800, |
365 | 6 | NO_AUTO_CAPTURE = #00001000, |
366 | 6 | NO_UTF8_CHECK = #00002000, |
367 | 6 | AUTO_CALLOUT = #00004000, |
368 | 6 | PARTIAL = #00008000, |
369 | 6 | DFA_SHORTEST = #00010000, |
370 | 6 | DFA_RESTART = #00020000, |
371 | 6 | FIRSTLINE = #00040000, |
372 | 6 | DUPNAMES = #00080000, |
373 | 6 | NEWLINE_CR = #00100000, |
374 | 6 | NEWLINE_LF = #00200000, |
375 | 6 | NEWLINE_CRLF = #00300000, |
376 | 6 | NEWLINE_ANY = #00400000, |
377 | 6 | NEWLINE_ANYCRLF = #00500000, |
378 | 6 | BSR_ANYCRLF = #00800000, |
379 | 6 | BSR_UNICODE = #01000000, |
380 | 6 | STRING_OFFSETS = #0C000000 |
381 | ||
382 | 6 | constant option_names = { |
383 | { DEFAULT, "DEFAULT" }, | |
384 | { CASELESS, "CASELESS" }, | |
385 | { MULTILINE, "MULTILINE" }, | |
386 | { DOTALL, "DOTALL" }, | |
387 | { EXTENDED, "EXTENDED" }, | |
388 | { ANCHORED, "ANCHORED" }, | |
389 | { DOLLAR_ENDONLY, "DOLLAR_ENDONLY" }, | |
390 | { EXTRA, "EXTRA" }, | |
391 | { NOTBOL, "NOTBOL" }, | |
392 | { NOTEOL, "NOTEOL" }, | |
393 | { UNGREEDY, "UNGREEDY" }, | |
394 | { NOTEMPTY, "NOTEMPTY" }, | |
395 | { UTF8, "UTF8" }, | |
396 | { NO_AUTO_CAPTURE, "NO_AUTO_CAPTURE" }, | |
397 | { NO_UTF8_CHECK, "NO_UTF8_CHECK" }, | |
398 | { AUTO_CALLOUT, "AUTO_CALLOUT" }, | |
399 | { PARTIAL, "PARTIAL" }, | |
400 | { DFA_SHORTEST, "DFA_SHORTEST" }, | |
401 | { DFA_RESTART, "DFA_RESTART" }, | |
402 | { FIRSTLINE, "FIRSTLINE" }, | |
403 | { DUPNAMES, "DUPNAMES" }, | |
404 | { NEWLINE_CR, "NEWLINE_CR" }, | |
405 | { NEWLINE_LF, "NEWLINE_LF" }, | |
406 | { NEWLINE_CRLF, "NEWLINE_CRLF" }, | |
407 | { NEWLINE_ANY, "NEWLINE_ANY" }, | |
408 | { NEWLINE_ANYCRLF, "NEWLINE_ANYCRLF" }, | |
409 | { BSR_ANYCRLF, "BSR_ANYCRLF" }, | |
410 | { BSR_UNICODE, "BSR_UNICODE" }, | |
411 | { STRING_OFFSETS, "STRING_OFFSETS" } | |
412 | } | |
413 | ||
414 | --**** | |
415 | -- === Error Constants | |
416 | ||
417 | public constant | |
418 | 6 | ERROR_NOMATCH = (-1), |
419 | 6 | ERROR_NULL = (-2), |
420 | 6 | ERROR_BADOPTION = (-3), |
421 | 6 | ERROR_BADMAGIC = (-4), |
422 | 6 | ERROR_UNKNOWN_OPCODE = (-5), |
423 | 6 | ERROR_UNKNOWN_NODE = (-5), |
424 | 6 | ERROR_NOMEMORY = (-6), |
425 | 6 | ERROR_NOSUBSTRING = (-7), |
426 | 6 | ERROR_MATCHLIMIT = (-8), |
427 | 6 | ERROR_CALLOUT = (-9), |
428 | 6 | ERROR_BADUTF8 = (-10), |
429 | 6 | ERROR_BADUTF8_OFFSET = (-11), |
430 | 6 | ERROR_PARTIAL = (-12), |
431 | 6 | ERROR_BADPARTIAL = (-13), |
432 | 6 | ERROR_INTERNAL = (-14), |
433 | 6 | ERROR_BADCOUNT = (-15), |
434 | 6 | ERROR_DFA_UITEM = (-16), |
435 | 6 | ERROR_DFA_UCOND = (-17), |
436 | 6 | ERROR_DFA_UMLIMIT = (-18), |
437 | 6 | ERROR_DFA_WSSIZE = (-19), |
438 | 6 | ERROR_DFA_RECURSE = (-20), |
439 | 6 | ERROR_RECURSIONLIMIT = (-21), |
440 | 6 | ERROR_NULLWSLIMIT = (-22), |
441 | 6 | ERROR_BADNEWLINE = (-23) |
442 | ||
443 | 6 | public constant error_names = { |
444 | {ERROR_NOMATCH ,"ERROR_NOMATCH"}, | |
445 | {ERROR_NULL ,"ERROR_NULL"}, | |
446 | {ERROR_BADOPTION ,"ERROR_BADOPTION"}, | |
447 | {ERROR_BADMAGIC ,"ERROR_BADMAGIC"}, | |
448 | {ERROR_UNKNOWN_OPCODE ,"ERROR_UNKNOWN_OPCODE/NODE"}, | |
449 | {ERROR_UNKNOWN_NODE ,"ERROR_UNKNOWN_OPCODE/NODE"}, | |
450 | {ERROR_NOMEMORY ,"ERROR_NOMEMORY"}, | |
451 | {ERROR_NOSUBSTRING ,"ERROR_NOSUBSTRING"}, | |
452 | {ERROR_MATCHLIMIT ,"ERROR_MATCHLIMIT"}, | |
453 | {ERROR_CALLOUT ,"ERROR_CALLOUT"}, | |
454 | {ERROR_BADUTF8 ,"ERROR_BADUTF8"}, | |
455 | {ERROR_BADUTF8_OFFSET ,"ERROR_BADUTF8_OFFSET"}, | |
456 | {ERROR_PARTIAL ,"ERROR_PARTIAL"}, | |
457 | {ERROR_BADPARTIAL ,"ERROR_BADPARTIAL"}, | |
458 | {ERROR_INTERNAL ,"ERROR_INTERNAL"}, | |
459 | {ERROR_BADCOUNT ,"ERROR_BADCOUNT"}, | |
460 | {ERROR_DFA_UITEM ,"ERROR_DFA_UITEM"}, | |
461 | {ERROR_DFA_UCOND ,"ERROR_DFA_UCOND"}, | |
462 | {ERROR_DFA_UMLIMIT ,"ERROR_DFA_UMLIMIT"}, | |
463 | {ERROR_DFA_WSSIZE ,"ERROR_DFA_WSSIZE"}, | |
464 | {ERROR_DFA_RECURSE ,"ERROR_DFA_RECURSE"}, | |
465 | {ERROR_RECURSIONLIMIT ,"ERROR_RECURSIONLIMIT"}, | |
466 | {ERROR_NULLWSLIMIT ,"ERROR_NULLWSLIMIT"}, | |
467 | {ERROR_BADNEWLINE ,"ERROR_BADNEWLINE"} | |
468 | } | |
469 | ||
470 | 6 | constant all_options = or_all({ |
471 | DEFAULT , | |
472 | CASELESS , | |
473 | MULTILINE , | |
474 | DOTALL , | |
475 | EXTENDED , | |
476 | ANCHORED , | |
477 | DOLLAR_ENDONLY , | |
478 | EXTRA , | |
479 | NOTBOL , | |
480 | NOTEOL , | |
481 | UNGREEDY , | |
482 | NOTEMPTY , | |
483 | UTF8 , | |
484 | NO_AUTO_CAPTURE , | |
485 | NO_UTF8_CHECK , | |
486 | AUTO_CALLOUT , | |
487 | PARTIAL , | |
488 | DFA_SHORTEST , | |
489 | DFA_RESTART , | |
490 | FIRSTLINE , | |
491 | DUPNAMES , | |
492 | NEWLINE_CR , | |
493 | NEWLINE_LF , | |
494 | NEWLINE_CRLF , | |
495 | NEWLINE_ANY , | |
496 | NEWLINE_ANYCRLF , | |
497 | BSR_ANYCRLF , | |
498 | BSR_UNICODE , | |
499 | STRING_OFFSETS}) | |
500 | ||
501 | ||
502 | --**** | |
503 | -- === Create/Destroy | |
504 | ||
505 | --** | |
506 | -- Regular expression type | |
507 | ||
508 | 242 | |
509 | 242 | return sequence(o) |
510 | end type | |
511 | ||
512 | --** | |
513 | -- Regular expression option specification type | |
514 | -- | |
515 | -- Although the functions do not use this type (they return an error instead), | |
516 | -- you can use this to check if your routine is receiving something sane. | |
517 | 207 | |
518 | 207 | if atom(o) then |
519 | 201 | if not integer(o) then |
520 | 0 | return 0 |
521 | else | |
522 | 201 | if (or_bits(o,all_options) != all_options) then |
523 | 0 | return 0 |
524 | else | |
525 | 201 | return 1 |
526 | end if | |
527 | end if | |
528 | 6 | elsif integer_array(o) then |
529 | 6 | return option_spec(or_all(o)) |
530 | else | |
531 | 0 | return 0 |
532 | end if | |
533 | end type | |
534 | ||
535 | --** | |
536 | -- Converts an option spec to a string. | |
537 | -- | |
538 | -- This can be useful for debugging what options were passed in. | |
539 | -- Without it you have to convert a number to hex and lookup the | |
540 | -- constants in the source code. | |
541 | 1 | |
542 | 1 | return flags:flags_to_string(o, option_names) |
543 | end function | |
544 | ||
545 | --** | |
546 | -- Converts a regex error to a string. | |
547 | -- | |
548 | -- This can be useful for debugging and even something rough to give to | |
549 | -- the user in case of a regex failure. It's preferable to | |
550 | -- a number. | |
551 | -- | |
552 | -- See Also: | |
553 | -- [[:error_message]] | |
554 | 26 | |
555 | 26 | if i >= 0 or i < -23 then |
556 | 2 | return sprintf("%d",{i}) |
557 | else | |
558 | 24 | return vlookup(i, error_names, 1, 2, "Unknown Error") |
559 | end if | |
560 | end function | |
561 | ||
562 | --** | |
563 | -- Return an allocated regular expression | |
564 | -- | |
565 | -- Parameters: | |
566 | -- # ##pattern## : a sequence representing a human readable regular expression | |
567 | -- # ##options## : defaults to [[:DEFAULT]]. See [[:Compile Time Option Constants]]. | |
568 | -- | |
569 | -- Returns: | |
570 | -- A **regex**, which other regular expression routines can work on or an atom to indicate an | |
571 | -- error. If an error, you can call [[:error_message]] to get a detailed error message. | |
572 | -- | |
573 | -- Comments: | |
574 | -- This is the only routine that accepts a human readable regular expression. The string is | |
575 | -- compiled and a [[:regex]] is returned. Analyzing and compiling a regular expression is a | |
576 | -- costly operation and should not be done more than necessary. For instance, if your application | |
577 | -- looks for an email address among text frequently, you should create the regular expression | |
578 | -- as a constant accessible to your source code and any files that may use it, thus, the regular | |
579 | -- expression is analyzed and compiled only once per run of your application. | |
580 | -- | |
581 | -- | |
582 | -- -- Bad Example | |
583 | -- include std/regex.e as re | |
584 | -- | |
585 | -- while sequence(line) do | |
586 | -- re:regex proper_name = re:new("[A-Z][a-z]+ [A-Z][a-z]+") | |
587 | -- if re:find(proper_name, line) then | |
588 | -- -- code | |
589 | -- end if | |
590 | -- end while | |
591 | -- | |
592 | -- | |
593 | -- | |
594 | -- -- Good Example | |
595 | -- include std/regex.e as re | |
596 | -- constant re_proper_name = re:new("[A-Z][a-z]+ [A-Z][a-z]+") | |
597 | -- while sequence(line) do | |
598 | -- if re:find(re_proper_name, line) then | |
599 | -- -- code | |
600 | -- end if | |
601 | -- end while | |
602 | -- | |
603 | -- | |
604 | -- Example 1: | |
605 | -- | |
606 | -- include std/regex.e as re | |
607 | -- re:regex number = re:new("[0-9]+") | |
608 | -- | |
609 | -- | |
610 | -- Note: | |
611 | -- For simple matches, the built-in Euphoria | |
612 | -- routine [[:eu:match]] and the library routine [[:wildcard:is_match]] | |
613 | -- are often times easier to use and | |
614 | -- a little faster. Regular expressions are faster for complex searching/matching. | |
615 | -- | |
616 | -- See Also: | |
617 | -- [[:error_message]], [[:find]], [[:find_all]] | |
618 | ||
619 | 47 | |
620 | 47 | if sequence(options) then options = or_all(options) end if |
621 | ||
622 | -- concatenation ensures we really get a new sequence, and don't just use the | |
623 | -- one passed in, which could be another regex previously created...this may | |
624 | -- be a bug with the refcount/delete_instance/regex code | |
625 | 47 | return machine_func(M_PCRE_COMPILE, { pattern, options }) |
626 | end function | |
627 | ||
628 | --** | |
629 | -- If ##[[:new]]## returns an atom, this function will return a text error message | |
630 | -- as to the reason. | |
631 | -- | |
632 | -- Parameters: | |
633 | -- # ##re##: Regular expression to get the error message from | |
634 | -- | |
635 | -- Returns: | |
636 | -- An atom (0) when no error message exists, otherwise a sequence describing the error. | |
637 | -- | |
638 | -- Example 1: | |
639 | -- | |
640 | -- include std/regex.e | |
641 | -- object r = regex:new("[A-Z[a-z]*") | |
642 | -- if atom(r) then | |
643 | -- printf(1, "Regex failed to compile: %s\n", { regex:error_message(r) }) | |
644 | -- end if | |
645 | -- | |
646 | -- | |
647 | ||
648 | 2 | |
649 | 2 | return machine_func(M_PCRE_ERROR_MESSAGE, { re }) |
650 | end function | |
651 | ||
652 | --**** | |
653 | -- === Utility Routines | |
654 | -- | |
655 | ||
656 | --** | |
657 | -- Escape special regular expression characters that may be entered into a search | |
658 | -- string from user input. | |
659 | -- | |
660 | -- Notes: | |
661 | -- Special regex characters are: | |
662 | -- {{{ | |
663 | -- . \ + * ? [ ^ ] $ ( ) { } = ! < > | : - | |
664 | -- }}} | |
665 | -- | |
666 | -- Parameters: | |
667 | -- # ##s##: string sequence to escape | |
668 | -- | |
669 | -- Returns: | |
670 | -- An escaped ##sequence## representing ##s##. | |
671 | -- | |
672 | -- Example 1: | |
673 | -- | |
674 | -- include std/regex.e as re | |
675 | -- sequence search_s = re:escape("Payroll is $***15.00") | |
676 | -- -- search_s = "Payroll is \\$\\*\\*\\*15\\.00" | |
677 | -- | |
678 | -- | |
679 | ||
680 | 1 | |
681 | 1 | return text:escape(s, ".\\+*?[^]$(){}=!<>|:-") |
682 | end function | |
683 | ||
684 | --** | |
685 | -- Returns the number of capturing subpatterns (the ovector size) for a regex | |
686 | -- | |
687 | -- Parameters: | |
688 | -- # ##ex## : a regex | |
689 | -- # ##maxsize## : optional maximum number of named groups to get data from | |
690 | -- | |
691 | -- Returns: | |
692 | -- An **integer** | |
693 | -- | |
694 | ||
695 | 85 | |
696 | ||
697 | 85 | integer m = machine_func(M_PCRE_GET_OVECTOR_SIZE, {ex}) |
698 | 85 | if (m > maxsize) then |
699 | 0 | return maxsize |
700 | end if | |
701 | 85 | return m+1 |
702 | end function | |
703 | ||
704 | --**** | |
705 | -- === Match | |
706 | ||
707 | --** | |
708 | -- Return the first match of ##re## in ##haystack##. You can optionally start at the position | |
709 | -- ##from##. | |
710 | -- | |
711 | -- Parameters: | |
712 | -- # ##re## : a regex for a subject to be matched against | |
713 | -- # ##haystack## : a string in which to search | |
714 | -- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1 | |
715 | -- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
716 | -- The only options that | |
717 | -- may be set when calling find are [[:ANCHORED]], [[:NEWLINE_CR]], [[:NEWLINE_LF]], | |
718 | -- [[:NEWLINE_CRLF]], [[:NEWLINE_ANY]], [[:NEWLINE_ANYCRLF]], [[:NOTBOL]], [[:NOTEOL]], | |
719 | -- [[:NOTEMPTY]], and [[:NO_UTF8_CHECK]]. | |
720 | -- ##options## can be any match time option or a | |
721 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
722 | -- any two valid option values. | |
723 | -- # ##size## : internal (how large an array the C backend should allocate). Defaults to 90, in rare cases this number may need to be increased in order to accommodate complex regex expressions. | |
724 | -- | |
725 | -- Returns: | |
726 | -- An **object**, which is either an atom of 0, meaning nothing matched or a sequence of matched pairs. | |
727 | -- For the explanation of the returned sequence, please see the first example. | |
728 | -- | |
729 | -- Example 1: | |
730 | -- | |
731 | -- include std/regex.e as re | |
732 | -- r = re:new("([A-Za-z]+) ([0-9]+)") -- John 20 or Jane 45 | |
733 | -- object result = re:find(r, "John 20") | |
734 | -- | |
735 | -- -- The return value will be: | |
736 | -- -- { | |
737 | -- -- { 1, 7 }, -- Total match | |
738 | -- -- { 1, 4 }, -- First grouping "John" ([A-Za-z]+) | |
739 | -- -- { 6, 7 } -- Second grouping "20" ([0-9]+) | |
740 | -- -- } | |
741 | -- | |
742 | -- | |
743 | ||
744 | 85 | |
745 | 85 | if sequence(options) then options = or_all(options) end if |
746 | 85 | if size < 0 then |
747 | 0 | size = 0 |
748 | end if | |
749 | ||
750 | 85 | return machine_func(M_PCRE_EXEC, { re, haystack, options, from, size }) |
751 | end function | |
752 | ||
753 | --** | |
754 | -- Return all matches of ##re## in ##haystack## optionally starting at the sequence position | |
755 | -- ##from##. | |
756 | -- | |
757 | -- Parameters: | |
758 | -- # ##re## : a regex for a subject to be matched against | |
759 | -- # ##haystack## : a string in which to search | |
760 | -- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1 | |
761 | -- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
762 | -- | |
763 | -- Returns: | |
764 | -- A **sequence** of **sequences** that were returned by [[:find]] and in the case of | |
765 | -- no matches this returns an empty **sequence**. | |
766 | -- Please see [[:find]] for a detailed description of each member of the return | |
767 | -- sequence. | |
768 | -- | |
769 | -- Example 1: | |
770 | -- | |
771 | -- include std/regex.e as re | |
772 | -- constant re_number = re:new("[0-9]+") | |
773 | -- object matches = re:find_all(re_number, "10 20 30") | |
774 | -- | |
775 | -- -- matches is: | |
776 | -- -- { | |
777 | -- -- {{1, 2}}, | |
778 | -- -- {{4, 5}}, | |
779 | -- -- {{7, 8}} | |
780 | -- -- } | |
781 | -- | |
782 | -- | |
783 | ||
784 | 16 | |
785 | 16 | if sequence(options) then options = or_all(options) end if |
786 | ||
787 | 16 | object result |
788 | 16 | sequence results = {} |
789 | 16 | while sequence(result) with entry do |
790 | 27 | results = append(results, result) |
791 | 27 | from = max(result) + 1 |
792 | ||
793 | 27 | if from > length(haystack) then |
794 | 10 | exit |
795 | end if | |
796 | entry | |
797 | 33 | result = find(re, haystack, from, options) |
798 | 33 | end while |
799 | ||
800 | 16 | return results |
801 | end function | |
802 | ||
803 | --** | |
804 | -- Determine if ##re## matches any portion of ##haystack##. | |
805 | -- | |
806 | -- Parameters: | |
807 | -- # ##re## : a regex for a subject to be matched against | |
808 | -- # ##haystack## : a string in which to search | |
809 | -- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1 | |
810 | -- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
811 | -- ##options## can be any match time option or a | |
812 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
813 | -- any two valid option values. | |
814 | -- | |
815 | -- Returns: | |
816 | -- An **atom**, 1 if ##re## matches any portion of ##haystack## or 0 if not. | |
817 | -- | |
818 | ||
819 | 2 | |
820 | 2 | return sequence(find(re, haystack, from, options)) |
821 | end function | |
822 | ||
823 | --** | |
824 | -- Determine if the entire ##haystack## matches ##re##. | |
825 | -- | |
826 | -- Parameters: | |
827 | -- # ##re## : a regex for a subject to be matched against | |
828 | -- # ##haystack## : a string in which to search | |
829 | -- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1 | |
830 | -- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
831 | -- ##options## can be any match time option or a | |
832 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
833 | -- any two valid option values. | |
834 | -- | |
835 | -- Returns: | |
836 | -- An **atom**, 1 if ##re## matches the entire ##haystack## or 0 if not. | |
837 | -- | |
838 | ||
839 | 15 | |
840 | 15 | object m = find(re, haystack, from, options) |
841 | ||
842 | 15 | if sequence(m) and length(m) > 0 and m[1][1] = 1 and m[1][2] = length(haystack) then |
843 | 9 | return 1 |
844 | end if | |
845 | ||
846 | 6 | return 0 |
847 | end function | |
848 | ||
849 | --** | |
850 | -- Get the matched text only. | |
851 | -- | |
852 | -- Parameters: | |
853 | -- # ##re## : a regex for a subject to be matched against | |
854 | -- # ##haystack## : a string in which to search | |
855 | -- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1 | |
856 | -- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
857 | -- ##options## can be any match time option or STRING_OFFSETS or a | |
858 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
859 | -- any two valid option values. | |
860 | -- | |
861 | -- Returns: | |
862 | -- Returns a **sequence** of strings, the first being the entire match and subsequent | |
863 | -- items being each of the captured groups or **ERROR_NOMATCH** if there is no match. | |
864 | -- The size of the sequence is the number | |
865 | -- of groups in the expression plus one (for the entire match). | |
866 | -- | |
867 | -- If ##options## contains the bit [[:STRING_OFFSETS]], then the result is different. | |
868 | -- For each item, a sequence is returned containing the matched text, the starting | |
869 | -- index in ##haystack## and the ending index in ##haystack##. | |
870 | -- | |
871 | -- Example 1: | |
872 | -- | |
873 | -- include std/regex.e as re | |
874 | -- constant re_name = re:new("([A-Z][a-z]+) ([A-Z][a-z]+)") | |
875 | -- | |
876 | -- object matches = re:matches(re_name, "John Doe and Jane Doe") | |
877 | -- -- matches is: | |
878 | -- -- { | |
879 | -- -- "John Doe", -- full match data | |
880 | -- -- "John", -- first group | |
881 | -- -- "Doe" -- second group | |
882 | -- -- } | |
883 | -- | |
884 | -- matches = re:matches(re_name, "John Doe and Jane Doe", 1, re:STRING_OFFSETS) | |
885 | -- -- matches is: | |
886 | -- -- { | |
887 | -- -- { "John Doe", 1, 8 }, -- full match data | |
888 | -- -- { "John", 1, 4 }, -- first group | |
889 | -- -- { "Doe", 6, 8 } -- second group | |
890 | -- -- } | |
891 | -- | |
892 | -- | |
893 | -- See Also: | |
894 | -- [[:all_matches]] | |
895 | -- | |
896 | 9 | |
897 | 9 | if sequence(options) then options = or_all(options) end if |
898 | 9 | integer str_offsets = and_bits(STRING_OFFSETS, options) |
899 | 9 | object match_data = find(re, haystack, from, and_bits(options, not_bits(STRING_OFFSETS))) |
900 | ||
901 | 9 | if atom(match_data) then return ERROR_NOMATCH end if |
902 | ||
903 | 5 | for i = 1 to length(match_data) do |
904 | 19 | sequence tmp |
905 | 19 | if match_data[i][1] = 0 then |
906 | 0 | tmp = "" |
907 | else | |
908 | 19 | tmp = haystack[match_data[i][1]..match_data[i][2]] |
909 | end if | |
910 | 19 | if str_offsets then |
911 | 3 | match_data[i] = { tmp, match_data[i][1], match_data[i][2] } |
912 | else | |
913 | 16 | match_data[i] = tmp |
914 | end if | |
915 | 19 | end for |
916 | ||
917 | 5 | return match_data |
918 | end function | |
919 | ||
920 | --** | |
921 | -- Get the text of all matches | |
922 | -- | |
923 | -- Parameters: | |
924 | -- # ##re## : a regex for a subject to be matched against | |
925 | -- # ##haystack## : a string in which to search | |
926 | -- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1 | |
927 | -- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
928 | -- ##options## can be any match time option or a | |
929 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
930 | -- any two valid option values. | |
931 | -- | |
932 | -- Returns: | |
933 | -- Returns **ERROR_NOMATCH** if there are no matches, or a **sequence** of **sequences** of | |
934 | -- **strings** if there is at least one match. In each member sequence of the returned sequence, | |
935 | -- the first string is the entire match and subsequent items being each of the | |
936 | -- captured groups. The size of the sequence is | |
937 | -- the number of groups in the expression plus one (for the entire match). In other words, | |
938 | -- each member of the return value will have the same structure as that returned by | |
939 | -- [[:matches]]. | |
940 | -- | |
941 | -- If ##options## contains the bit [[:STRING_OFFSETS]], then the result is different. | |
942 | -- In each member sequence, instead of each member being a string each member is itself a sequence | |
943 | -- containing the matched text, the starting index in ##haystack## and the ending | |
944 | -- index in ##haystack##. | |
945 | -- | |
946 | -- Example 1: | |
947 | -- | |
948 | -- include std/regex.e as re | |
949 | -- constant re_name = re:new("([A-Z][a-z]+) ([A-Z][a-z]+)") | |
950 | -- | |
951 | -- object matches = re:all_matches(re_name, "John Doe and Jane Doe") | |
952 | -- -- matches is: | |
953 | -- -- { | |
954 | -- -- { -- first match | |
955 | -- -- "John Doe", -- full match data | |
956 | -- -- "John", -- first group | |
957 | -- -- "Doe" -- second group | |
958 | -- -- }, | |
959 | -- -- { -- second match | |
960 | -- -- "Jane Doe", -- full match data | |
961 | -- -- "Jane", -- first group | |
962 | -- -- "Doe" -- second group | |
963 | -- -- } | |
964 | -- -- } | |
965 | -- | |
966 | -- matches = re:all_matches(re_name, "John Doe and Jane Doe", 1, re:STRING_OFFSETS) | |
967 | -- -- matches is: | |
968 | -- -- { | |
969 | -- -- { -- first match | |
970 | -- -- { "John Doe", 1, 8 }, -- full match data | |
971 | -- -- { "John", 1, 4 }, -- first group | |
972 | -- -- { "Doe", 6, 8 } -- second group | |
973 | -- -- }, | |
974 | -- -- { -- second match | |
975 | -- -- { "Jane Doe", 14, 21 }, -- full match data | |
976 | -- -- { "Jane", 14, 17 }, -- first group | |
977 | -- -- { "Doe", 19, 21 } -- second group | |
978 | -- -- } | |
979 | -- -- } | |
980 | -- | |
981 | -- | |
982 | -- See Also: | |
983 | -- [[:matches]] | |
984 | ||
985 | 3 | |
986 | 3 | if sequence(options) then options = or_all(options) end if |
987 | 3 | integer str_offsets = and_bits(STRING_OFFSETS, options) |
988 | 3 | object match_data = find_all(re, haystack, from, and_bits(options, not_bits(STRING_OFFSETS))) |
989 | ||
990 | 3 | if length(match_data) = 0 then return ERROR_NOMATCH end if |
991 | ||
992 | 2 | for i = 1 to length(match_data) do |
993 | 4 | for j = 1 to length(match_data[i]) do |
994 | 12 | sequence tmp = haystack[match_data[i][j][1]..match_data[i][j][2]] |
995 | 12 | if str_offsets then |
996 | 6 | match_data[i][j] = { tmp, match_data[i][j][1], match_data[i][j][2] } |
997 | else | |
998 | 6 | match_data[i][j] = tmp |
999 | end if | |
1000 | 12 | end for |
1001 | 4 | end for |
1002 | ||
1003 | 2 | return match_data |
1004 | end function | |
1005 | ||
1006 | --**** | |
1007 | -- === Splitting | |
1008 | ||
1009 | --** | |
1010 | -- Split a string based on a regex as a delimiter | |
1011 | -- | |
1012 | -- Parameters: | |
1013 | -- # ##re## : a regex which will be used for matching | |
1014 | -- # ##text## : a string to be split | |
1015 | -- # ##from## : optional start position | |
1016 | -- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
1017 | -- ##options## can be any match time option or a | |
1018 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
1019 | -- any two valid option values. | |
1020 | -- | |
1021 | -- Returns: | |
1022 | -- A **sequence** of string values split at the delimiter and if no delimiters were matched | |
1023 | -- this **sequence** will be a one member sequence equal to ##{text}##. | |
1024 | -- | |
1025 | -- Example 1: | |
1026 | -- | |
1027 | -- include std/regex.e as re | |
1028 | -- regex comma_space_re = re:new(`,\s`) | |
1029 | -- sequence data = re:split(comma_space_re, "euphoria programming, source code, reference data") | |
1030 | -- -- data is | |
1031 | -- -- { | |
1032 | -- -- "euphoria programming", | |
1033 | -- -- "source code", | |
1034 | -- -- "reference data" | |
1035 | -- -- } | |
1036 | -- | |
1037 | -- | |
1038 | ||
1039 | 1 | |
1040 | 1 | return split_limit(re, text, 0, from, options) |
1041 | end function | |
1042 | ||
1043 | 2 | |
1044 | 2 | if sequence(options) then options = or_all(options) end if |
1045 | 2 | sequence match_data = find_all(re, text, from, options), result |
1046 | 2 | integer last = 1 |
1047 | ||
1048 | 2 | if limit = 0 or limit > length(match_data) then |
1049 | 1 | limit = length(match_data) |
1050 | end if | |
1051 | ||
1052 | 2 | result = repeat(0, limit) |
1053 | ||
1054 | 2 | for i = 1 to limit do |
1055 | 3 | result[i] = text[last..match_data[i][1][1] - 1] |
1056 | 3 | last = match_data[i][1][2] + 1 |
1057 | 3 | end for |
1058 | ||
1059 | 2 | if last < length(text) then |
1060 | 2 | result &= { text[last..$] } |
1061 | end if | |
1062 | ||
1063 | 2 | return result |
1064 | end function | |
1065 | ||
1066 | --**** | |
1067 | -- === Replacement | |
1068 | -- | |
1069 | ||
1070 | --** | |
1071 | -- Replaces all matches of a regex with the replacement text. | |
1072 | -- | |
1073 | -- Parameters: | |
1074 | -- # ##ex## : a regex which will be used for matching | |
1075 | -- # ##text## : a string on which search and replace will apply | |
1076 | -- # ##replacement## : a string, used to replace each of the full matches | |
1077 | -- # ##from## : optional start position | |
1078 | -- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
1079 | -- ##options## can be any match time option or a | |
1080 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
1081 | -- any two valid option values. | |
1082 | -- | |
1083 | -- Returns: | |
1084 | -- A **sequence**, the modified ##text##. If there is no match with ##re## the | |
1085 | -- return value will be the same as ##text## when it was passed in. | |
1086 | -- | |
1087 | -- Special replacement operators: | |
1088 | -- | |
1089 | -- * **##\##** ~-- Causes the next character to lose its special meaning. | |
1090 | -- * **##\n##** ~-- Inserts a 0x0A (LF) character. | |
1091 | -- * **##\r##** ~-- Inserts a 0x0D (CR) character. | |
1092 | -- * **##\t##** ~-- Inserts a 0x09 (TAB) character. | |
1093 | -- * **##\1##** to **##\9##** ~-- Recalls stored substrings from registers (\1, \2, \3, to \9). | |
1094 | -- * **##\0##** ~-- Recalls entire matched pattern. | |
1095 | -- * **##\u##** ~-- Convert next character to uppercase | |
1096 | -- * **##\l##** ~-- Convert next character to lowercase | |
1097 | -- * **##\U##** ~-- Convert to uppercase till ##\E## or ##\e## | |
1098 | -- * **##\L##** ~-- Convert to lowercase till ##\E## or ##\e## | |
1099 | -- * **##\E##** or **##\e##** ~-- Terminate a ##{{{\\}}}U## or ##\L## conversion | |
1100 | -- | |
1101 | -- Example 1: | |
1102 | -- | |
1103 | -- include std/regex.e | |
1104 | -- regex r = new(`([A-Za-z]+)\.([A-Za-z]+)`) | |
1105 | -- sequence details = find_replace(r, "hello.txt", `Filename: \U\1\e Extension: \U\2\e`) | |
1106 | -- -- details = "Filename: HELLO Extension: TXT" | |
1107 | -- | |
1108 | -- | |
1109 | ||
1110 | 5 | |
1111 | option_spec options=DEFAULT) | |
1112 | 5 | return find_replace_limit(ex, text, replacement, -1, from, options) |
1113 | end function | |
1114 | ||
1115 | --** | |
1116 | -- Replaces up to ##limit## matches of ##ex## in ##text## except when ##limit## is 0. When | |
1117 | -- ##limit## is 0, this routine replaces all of the matches. | |
1118 | -- | |
1119 | -- This function is identical to [[:find_replace]] except it allows you to limit the number of | |
1120 | -- replacements to perform. Please see the documentation for [[:find_replace]] for all the | |
1121 | -- details. | |
1122 | -- | |
1123 | -- Parameters: | |
1124 | -- # ##ex## : a regex which will be used for matching | |
1125 | -- # ##text## : a string on which search and replace will apply | |
1126 | -- # ##replacement## : a string, used to replace each of the full matches | |
1127 | -- # ##limit## : the number of matches to process | |
1128 | -- # ##from## : optional start position | |
1129 | -- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
1130 | -- ##options## can be any match time option or a | |
1131 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
1132 | -- any two valid option values. | |
1133 | -- | |
1134 | -- Returns: | |
1135 | -- A **sequence**, the modified ##text##. | |
1136 | -- | |
1137 | -- See Also: | |
1138 | -- [[:find_replace]] | |
1139 | -- | |
1140 | ||
1141 | 6 | |
1142 | integer limit, integer from=1, option_spec options=DEFAULT) | |
1143 | 6 | if sequence(options) then options = or_all(options) end if |
1144 | ||
1145 | 6 | return machine_func(M_PCRE_REPLACE, { ex, text, replacement, options, from, limit }) |
1146 | end function | |
1147 | ||
1148 | --** | |
1149 | -- When ##limit## is positive, | |
1150 | -- this routine replaces up to ##limit## matches of ##ex## in ##text## with the | |
1151 | -- result of the user | |
1152 | -- defined callback, ##rid##, and when ##limit## is 0, replaces | |
1153 | -- all matches of ##ex## in ##text## with the result of this user defined callback, ##rid##. | |
1154 | -- | |
1155 | -- The callback should take one sequence. The first member of this sequence will be | |
1156 | -- a string | |
1157 | -- representing the entire match and the subsequent members, if they exist, | |
1158 | -- will be strings | |
1159 | -- for the captured groups within the regular expression. | |
1160 | -- | |
1161 | -- Parameters: | |
1162 | -- # ##ex## : a regex which will be used for matching | |
1163 | -- # ##text## : a string on which search and replace will apply | |
1164 | -- # ##rid## : routine id to execute for each match | |
1165 | -- # ##limit## : the number of matches to process | |
1166 | -- # ##from## : optional start position | |
1167 | -- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]]. | |
1168 | -- ##options## can be any match time option or a | |
1169 | -- sequence of valid options or it can be a value that comes from using or_bits on | |
1170 | -- any two valid option values. | |
1171 | -- | |
1172 | -- Returns: | |
1173 | -- A **sequence**, the modified ##text##. | |
1174 | -- | |
1175 | -- Example 1: | |
1176 | -- | |
1177 | -- include std/regex.e as re | |
1178 | -- function my_convert(sequence params) | |
1179 | -- switch params[1] do | |
1180 | -- case "1" then | |
1181 | -- return "one " | |
1182 | -- case "2" then | |
1183 | -- return "two " | |
1184 | -- case else | |
1185 | -- return "unknown " | |
1186 | -- end switch | |
1187 | -- end function | |
1188 | -- | |
1189 | -- regex r = re:new(`\d`) | |
1190 | -- sequence result = re:find_replace_callback(r, "125", routine_id("my_convert")) | |
1191 | -- -- result = "one two unknown " | |
1192 | -- | |
1193 | -- | |
1194 | ||
1195 | 3 | |
1196 | integer from=1, option_spec options=DEFAULT) | |
1197 | 3 | if sequence(options) then options = or_all(options) end if |
1198 | 3 | sequence match_data = find_all(ex, text, from, options), replace_data |
1199 | ||
1200 | 3 | if limit = 0 or limit > length(match_data) then |
1201 | 2 | limit = length(match_data) |
1202 | end if | |
1203 | 3 | replace_data = repeat(0, limit) |
1204 | ||
1205 | 3 | for i = 1 to limit do |
1206 | 4 | sequence params = repeat(0, length(match_data[i])) |
1207 | 4 | for j = 1 to length(match_data[i]) do |
1208 | 12 | params[j] = text[match_data[i][j][1]..match_data[i][j][2]] |
1209 | 12 | end for |
1210 | ||
1211 | 4 | replace_data[i] = call_func(rid, { params }) |
1212 | 4 | end for |
1213 | ||
1214 | 3 | for i = limit to 1 by -1 do |
1215 | 4 | text = replace(text, replace_data[i], match_data[i][1][1], match_data[i][1][2]) |
1216 | 4 | end for |
1217 | ||
1218 | 3 | return text |
1219 | end function |