COVERAGE SUMMARY

FILE SUMMARY

Name	Executed	Routines	%	Executed	Lines	%	Unexecuted
/home/matt/eu/rds/include/std/regex.e	19	19	100.00%	172	178	96.63%	6

ROUTINE SUMMARY

Routine	Executed	Lines		Unexecuted
option_spec()	7	10	70.00%	3
find()	4	5	80.00%	1
get_ovector_size()	4	5	80.00%	1
matches()	14	15	93.33%	1
all_matches()	14	14	100.00%	0
error_message()	2	2	100.00%	0
error_to_string()	4	4	100.00%	0
escape()	2	2	100.00%	0
find_all()	12	12	100.00%	0
find_replace()	2	2	100.00%	0
find_replace_callback()	17	17	100.00%	0
find_replace_limit()	3	3	100.00%	0
has_match()	2	2	100.00%	0
is_match()	5	5	100.00%	0
new()	3	3	100.00%	0
option_spec_to_string()	2	2	100.00%	0
regex()	2	2	100.00%	0
split()	2	2	100.00%	0
split_limit()	14	14	100.00%	0

LINE COVERAGE DETAIL

#	Executed
1		-- (c) Copyright - See License.txt
2		namespace regex
3
4		include std/math.e
5		include std/text.e
6		include std/types.e
7		include std/flags.e as flags
8		include std/error.e
9		include std/search.e
10
11		--****
12		-- == Regular Expressions
13		--
14		-- <>
15		--
16		-- === Introduction
17		--
18		-- Regular expressions in Euphoria are based on the PCRE (Perl Compatible Regular Expressions)
19		-- library created by Philip Hazel.
20		--
21		-- This document will detail the Euphoria interface to Regular Expressions, not really
22		-- regular expression syntax. It is a very complex subject that many books have been
23		-- written on. Here are a few good resources online that can help while learning
24		-- regular expressions.
25		--
26		-- * [[EUForum Article -> http://openeuphoria.org/wiki/euwiki.cgi?EuGuide%20Regular%20Expressions ]]
27		-- * [[Perl Regular Expressions Man Page -> http://perldoc.perl.org/perlre.html]]
28		-- * [[Regular Expression Library -> http://regexlib.com/]] (user supplied regular
29		-- expressions for just about any task).
30		-- * [[WikiPedia Regular Expression Article -> http://en.wikipedia.org/wiki/Regular_expression]]
31		-- * [[Man page of PCRE in HTML -> http://www.slabihoud.de/software/archives/pcrecompat.html]]
32		-- === General Use
33		--
34		-- Many functions take an optional ##options## parameter. This parameter can be either
35		-- a single option constant (see [[:Option Constants]]), multiple option constants or'ed
36		-- together into a single atom or a sequence of options, in which the function will take
37		-- care of ensuring they are or'ed together correctly. Options are like their C equivalents
38		-- with the 'PCRE_' prefix stripped off. Name spaces disambiguate symbols so we don't
39		-- need this prefix.
40		--
41		-- All strings passed into this library must be either 8-bit per character strings or
42		-- UTF which uses multiple bytes to encode UNICODE characters. You can
43		-- use UTF8 encoded UNICODE strings when you pass the UTF8 option.
44
45	6	enum M_PCRE_COMPILE=68, M_PCRE_FREE, M_PCRE_EXEC, M_PCRE_REPLACE, M_PCRE_ERROR_MESSAGE=95, M_PCRE_GET_OVECTOR_SIZE=97
46
47		--****
48		-- === Option Constants
49		--
50		-- ==== Compile Time and Match Time
51		--
52		-- When a regular expression object is created via ##new## we can also say it gets "compiled."
53		-- The options you may use for this are called "compile time" option constants. Once
54		-- the regular expression is created you can use the other functions that take this regular
55		-- expression and a string. These routines' options are called "match time" option constants.
56		-- To not set any options at all, do not supply the options argument or supply [[:DEFAULT]].
57		--
58		-- ===== Compile Time Option Constants
59		--
60		-- The only options that may be set at "compile time"; that is, to pass to ##new##;
61		-- are [[:ANCHORED]], [[:AUTO_CALLOUT]], [[:BSR_ANYCRLF]], [[:BSR_UNICODE]], [[:CASELESS]],
62		-- [[:DEFAULT]], [[:DOLLAR_ENDONLY]], [[:DOTALL]], [[:DUPNAMES]], [[:EXTENDED]], [[:EXTRA]],
63		-- [[:FIRSTLINE]], [[:MULTILINE]], [[:NEWLINE_CR]], [[:NEWLINE_LF]], [[:NEWLINE_CRLF]],
64		-- [[:NEWLINE_ANY]], [[:NEWLINE_ANYCRLF]], [[:NO_AUTO_CAPTURE]], [[:NO_UTF8_CHECK]],
65		-- [[:UNGREEDY]], and [[:UTF8]].
66		--
67		--
68		-- ===== Match Time Option Constants
69		--
70		-- Options that may be set at "match time" are [[:ANCHORED]], [[:NEWLINE_CR]], [[:NEWLINE_LF]],
71		-- [[:NEWLINE_CRLF]], [[:NEWLINE_ANY]], [[:NEWLINE_ANYCRLF]], [[:NOTBOL]], [[:NOTEOL]],
72		-- [[:NOTEMPTY]], [[:NO_UTF8_CHECK]]. Routines that take match time option constants match,
73		-- split or replace a regular expression against some string.
74		--
75		--
76
77		--****
78		-- Signature:
79		-- public constant ANCHORED
80		--
81		-- Description:
82		-- Forces matches to be only from the first place it is asked to
83		-- try to make a search.
84		-- In C, this is called PCRE_ANCHORED.
85		-- This is passed to all routines including [[:new]].
86
87		--****
88		-- Signature:
89		-- public constant AUTO_CALLOUT
90		--
91		-- Description:
92		-- In C, this is called PCRE_AUTO_CALLOUT.
93		-- To get the functionality of this flag in EUPHORIA, you can use:
94		-- [[:find_replace_callback]] without passing this option.
95		-- This is passed to [[:new]].
96
97		--****
98		-- Signature:
99		-- public constant BSR_ANYCRLF
100		--
101		-- Description:
102		-- With this option only ASCII new line sequences are recognized as newlines. Other UNICODE
103		-- newline sequences (encoded as UTF8) are not recognized as an end of line marker.
104		-- This is passed to all routines including [[:new]].
105
106		--****
107		-- Signature:
108		-- public constant BSR_UNICODE
109		--
110		-- Description:
111		-- With this option any UNICODE new line sequence is recognized as a newline.
112		-- The UNICODE will have to be encoded as UTF8, however.
113		-- This is passed to all routines including [[:new]].
114
115		--****
116		-- Signature:
117		-- public constant CASELESS
118		--
119		-- Description:
120		-- This will make your regular expression matches case insensitive. With this
121		-- flag for example, [a-z] is the same as [A-Za-z].
122		-- This is passed to [[:new]].
123
124		--****
125		-- Signature:
126		-- public constant DEFAULT
127		--
128		-- Description:
129		-- This is a value used for not setting any flags at all. This can be passed to
130		-- all routines including [[:new]]
131
132		--****
133		-- Signature:
134		-- public constant DFA_SHORTEST
135		--
136		-- Description:
137		-- This is NOT used by any standard library routine.
138
139		--****
140		-- Signature:
141		-- public constant DFA_RESTART
142		--
143		-- Description:
144		-- This is NOT used by any standard library routine.
145
146		--****
147		-- Signature:
148		-- public constant DOLLAR_ENDONLY
149		--
150		-- Description:
151		-- If this bit is set, a dollar sign metacharacter in the pattern matches only
152		-- at the end of the subject string. Without this option, a dollar sign also
153		-- matches immediately before a newline at the end of the string (but not
154		-- before any other newlines). Thus you must include the newline character
155		-- in the pattern before the dollar sign if you want to match a line that contains
156		-- a newline character.
157		-- The DOLLAR_ENDONLY option is ignored if MULTILINE is set.
158		-- There is no way to set this option within a pattern.
159		-- This is passed to [[:new]].
160
161		--****
162		-- Signature:
163		-- public constant DOTALL
164		--
165		-- Description:
166		-- With this option the '.' character also matches a newline sequence.
167		-- This is passed to [[:new]].
168
169		--****
170		-- Signature:
171		-- public constant DUPNAMES
172		--
173		-- Description:
174		-- Allow duplicate names for named subpatterns.
175		-- Since there is no way to access named subpatterns this flag has no effect.
176		-- This is passed to [[:new]].
177
178		--****
179		-- Signature:
180		-- public constant EXTENDED
181		--
182		-- Description:
183		-- Whitespace and characters beginning with a hash mark to the end of the line
184		-- in the pattern will be ignored when searching except when the whitespace or hash
185		-- is escaped or in a character class.
186		-- This is passed to [[:new]].
187
188		--****
189		-- Signature:
190		-- public constant EXTRA
191		--
192		-- Description:
193		-- When a backslash (\) is followed by an alphanumeric character that has no
194		-- special meaning, an error is generated.
195		-- This is passed to [[:new]].
196
197		--****
198		-- Signature:
199		-- public constant FIRSTLINE
200		--
201		-- Description:
202		-- If PCRE_FIRSTLINE is set, the match must happen before or at the first
203		-- newline in the subject (though it may continue over the newline).
204		-- This is passed to [[:new]].
205
206		--****
207		-- Signature:
208		-- public constant MULTILINE
209		--
210		-- Description:
211		-- When MULTILINE is set, the "start of line" and "end of line"
212		-- constructs match immediately following or immediately before internal
213		-- newlines in the subject string, respectively, as well as at the very
214		-- start and end. This is passed to [[:new]].
215
216		--****
217		-- Signature:
218		-- public constant NEWLINE_CR
219		--
220		-- Description:
221		-- Sets CR as the NEWLINE sequence.
222		-- The NEWLINE sequence will match $
223		-- when MULTILINE is set.
224		-- This is passed to all routines including [[:new]].
225
226		--****
227		-- Signature:
228		-- public constant NEWLINE_LF
229		--
230		-- Description:
231		-- Sets LF as the NEWLINE sequence.
232		-- The NEWLINE sequence will match $
233		-- when MULTILINE is set.
234		-- This is passed to all routines including [[:new]].
235
236		--****
237		-- Signature:
238		-- public constant NEWLINE_CRLF
239		--
240		-- Description:
241		-- Sets CRLF as the NEWLINE sequence
242		-- The NEWLINE sequence will match $
243		-- when MULTILINE is set.
244		-- This is passed to all routines including [[:new]].
245
246		--****
247		-- Signature:
248		-- public constant NEWLINE_ANY
249		--
250		-- Description:
251		-- Sets ANY newline sequence as the NEWLINE sequence including
252		-- those from UNICODE when UTF8 is also set. The string will have
253		-- to be encoded as UTF8, however.
254		-- The NEWLINE sequence will match $
255		-- when MULTILINE is set.
256		-- This is passed to all routines including [[:new]].
257
258		--****
259		-- Signature:
260		-- public constant NEWLINE_ANYCRLF
261		--
262		-- Description:
263		-- Sets ANY newline sequence from ASCII.
264		-- The NEWLINE sequence will match $
265		-- when MULTILINE is set.
266		-- This is passed to all routines including [[:new]].
267
268		--****
269		-- Signature:
270		-- public constant NOTBOL
271		--
272		-- Description:
273		-- This indicates that beginning of the passed string does NOT start
274		-- at the Beginning Of a Line (NOTBOL), so a caret symbol (^) in the
275		-- original pattern will not match the beginning of the string.
276		-- This is used by routines other than [[:new]].
277
278		--****
279		-- Signature:
280		-- public constant NOTEOL
281		--
282		-- Description:
283		-- This indicates that end of the passed string does NOT end
284		-- at the End Of a Line (NOTEOL), so a dollar sign ($) in the
285		-- original pattern will not match the end of the string.
286		-- This is used by routines other than [[:new]].
287
288		--****
289		-- Signature:
290		-- public constant NO_AUTO_CAPTURE
291		--
292		-- Description:
293		-- Disables capturing subpatterns except when the subpatterns are
294		-- named.
295		-- This is passed to [[:new]].
296
297		--****
298		-- Signature:
299		-- public constant NO_UTF8_CHECK
300		--
301		-- Description:
302		-- Turn off checking for the validity of your UTF string. Use this
303		-- with caution. An invalid utf8 string with this option could crash
304		-- your program. Only use this if you know the string is a valid utf8 string.
305		-- See [[:unicode:validate]].
306		-- This is passed to all routines including [[:new]].
307
308		--****
309		-- Signature:
310		-- public constant NOTEMPTY
311		--
312		-- Description:
313		-- Here matches of empty strings will not be allowed. In C, this is PCRE_NOTEMPTY.
314		-- The pattern: `A*a*` will match "AAAA", "aaaa", and "Aaaa" but not "".
315		-- This is used by routines other than [[:new]].
316
317		--****
318		-- Signature:
319		-- public constant PARTIAL
320		--
321		-- Description:
322		-- This option has no effect with these routines. Refer to the C documentation
323		-- for what it does in C.
324		-- In C, this constant is called PCRE_PARTIAL.
325		-- This is used by routines other than [[:new]].
326
327		--****
328		-- Signature:
329		-- public constant STRING_OFFSETS
330		--
331		-- Description:
332		-- This is used by [[:matches]] and [[:all_matches]].
333
334		--****
335		-- Signature:
336		-- public constant UNGREEDY
337		-- This modifier sets the pattern such that quantifiers are
338		-- not greedy by default, but become greedy if followed by a question mark.
339		--
340		-- Description:
341		-- This is passed to [[:new]].
342
343		--****
344		-- Signature:
345		-- public constant UTF8
346		--
347		-- Description:
348		-- Makes strings passed in to be interpreted as a UTF8 encoded string.
349		-- This is passed to [[:new]].
350
351		public constant
352	6	DEFAULT = #00000000,
353	6	CASELESS = #00000001,
354	6	MULTILINE = #00000002,
355	6	DOTALL = #00000004,
356	6	EXTENDED = #00000008,
357	6	ANCHORED = #00000010,
358	6	DOLLAR_ENDONLY = #00000020,
359	6	EXTRA = #00000040,
360	6	NOTBOL = #00000080,
361	6	NOTEOL = #00000100,
362	6	UNGREEDY = #00000200,
363	6	NOTEMPTY = #00000400,
364	6	UTF8 = #00000800,
365	6	NO_AUTO_CAPTURE = #00001000,
366	6	NO_UTF8_CHECK = #00002000,
367	6	AUTO_CALLOUT = #00004000,
368	6	PARTIAL = #00008000,
369	6	DFA_SHORTEST = #00010000,
370	6	DFA_RESTART = #00020000,
371	6	FIRSTLINE = #00040000,
372	6	DUPNAMES = #00080000,
373	6	NEWLINE_CR = #00100000,
374	6	NEWLINE_LF = #00200000,
375	6	NEWLINE_CRLF = #00300000,
376	6	NEWLINE_ANY = #00400000,
377	6	NEWLINE_ANYCRLF = #00500000,
378	6	BSR_ANYCRLF = #00800000,
379	6	BSR_UNICODE = #01000000,
380	6	STRING_OFFSETS = #0C000000
381
382	6	constant option_names = {
383		{ DEFAULT, "DEFAULT" },
384		{ CASELESS, "CASELESS" },
385		{ MULTILINE, "MULTILINE" },
386		{ DOTALL, "DOTALL" },
387		{ EXTENDED, "EXTENDED" },
388		{ ANCHORED, "ANCHORED" },
389		{ DOLLAR_ENDONLY, "DOLLAR_ENDONLY" },
390		{ EXTRA, "EXTRA" },
391		{ NOTBOL, "NOTBOL" },
392		{ NOTEOL, "NOTEOL" },
393		{ UNGREEDY, "UNGREEDY" },
394		{ NOTEMPTY, "NOTEMPTY" },
395		{ UTF8, "UTF8" },
396		{ NO_AUTO_CAPTURE, "NO_AUTO_CAPTURE" },
397		{ NO_UTF8_CHECK, "NO_UTF8_CHECK" },
398		{ AUTO_CALLOUT, "AUTO_CALLOUT" },
399		{ PARTIAL, "PARTIAL" },
400		{ DFA_SHORTEST, "DFA_SHORTEST" },
401		{ DFA_RESTART, "DFA_RESTART" },
402		{ FIRSTLINE, "FIRSTLINE" },
403		{ DUPNAMES, "DUPNAMES" },
404		{ NEWLINE_CR, "NEWLINE_CR" },
405		{ NEWLINE_LF, "NEWLINE_LF" },
406		{ NEWLINE_CRLF, "NEWLINE_CRLF" },
407		{ NEWLINE_ANY, "NEWLINE_ANY" },
408		{ NEWLINE_ANYCRLF, "NEWLINE_ANYCRLF" },
409		{ BSR_ANYCRLF, "BSR_ANYCRLF" },
410		{ BSR_UNICODE, "BSR_UNICODE" },
411		{ STRING_OFFSETS, "STRING_OFFSETS" }
412		}
413
414		--****
415		-- === Error Constants
416
417		public constant
418	6	ERROR_NOMATCH = (-1),
419	6	ERROR_NULL = (-2),
420	6	ERROR_BADOPTION = (-3),
421	6	ERROR_BADMAGIC = (-4),
422	6	ERROR_UNKNOWN_OPCODE = (-5),
423	6	ERROR_UNKNOWN_NODE = (-5),
424	6	ERROR_NOMEMORY = (-6),
425	6	ERROR_NOSUBSTRING = (-7),
426	6	ERROR_MATCHLIMIT = (-8),
427	6	ERROR_CALLOUT = (-9),
428	6	ERROR_BADUTF8 = (-10),
429	6	ERROR_BADUTF8_OFFSET = (-11),
430	6	ERROR_PARTIAL = (-12),
431	6	ERROR_BADPARTIAL = (-13),
432	6	ERROR_INTERNAL = (-14),
433	6	ERROR_BADCOUNT = (-15),
434	6	ERROR_DFA_UITEM = (-16),
435	6	ERROR_DFA_UCOND = (-17),
436	6	ERROR_DFA_UMLIMIT = (-18),
437	6	ERROR_DFA_WSSIZE = (-19),
438	6	ERROR_DFA_RECURSE = (-20),
439	6	ERROR_RECURSIONLIMIT = (-21),
440	6	ERROR_NULLWSLIMIT = (-22),
441	6	ERROR_BADNEWLINE = (-23)
442
443	6	public constant error_names = {
444		{ERROR_NOMATCH ,"ERROR_NOMATCH"},
445		{ERROR_NULL ,"ERROR_NULL"},
446		{ERROR_BADOPTION ,"ERROR_BADOPTION"},
447		{ERROR_BADMAGIC ,"ERROR_BADMAGIC"},
448		{ERROR_UNKNOWN_OPCODE ,"ERROR_UNKNOWN_OPCODE/NODE"},
449		{ERROR_UNKNOWN_NODE ,"ERROR_UNKNOWN_OPCODE/NODE"},
450		{ERROR_NOMEMORY ,"ERROR_NOMEMORY"},
451		{ERROR_NOSUBSTRING ,"ERROR_NOSUBSTRING"},
452		{ERROR_MATCHLIMIT ,"ERROR_MATCHLIMIT"},
453		{ERROR_CALLOUT ,"ERROR_CALLOUT"},
454		{ERROR_BADUTF8 ,"ERROR_BADUTF8"},
455		{ERROR_BADUTF8_OFFSET ,"ERROR_BADUTF8_OFFSET"},
456		{ERROR_PARTIAL ,"ERROR_PARTIAL"},
457		{ERROR_BADPARTIAL ,"ERROR_BADPARTIAL"},
458		{ERROR_INTERNAL ,"ERROR_INTERNAL"},
459		{ERROR_BADCOUNT ,"ERROR_BADCOUNT"},
460		{ERROR_DFA_UITEM ,"ERROR_DFA_UITEM"},
461		{ERROR_DFA_UCOND ,"ERROR_DFA_UCOND"},
462		{ERROR_DFA_UMLIMIT ,"ERROR_DFA_UMLIMIT"},
463		{ERROR_DFA_WSSIZE ,"ERROR_DFA_WSSIZE"},
464		{ERROR_DFA_RECURSE ,"ERROR_DFA_RECURSE"},
465		{ERROR_RECURSIONLIMIT ,"ERROR_RECURSIONLIMIT"},
466		{ERROR_NULLWSLIMIT ,"ERROR_NULLWSLIMIT"},
467		{ERROR_BADNEWLINE ,"ERROR_BADNEWLINE"}
468		}
469
470	6	constant all_options = or_all({
471		DEFAULT ,
472		CASELESS ,
473		MULTILINE ,
474		DOTALL ,
475		EXTENDED ,
476		ANCHORED ,
477		DOLLAR_ENDONLY ,
478		EXTRA ,
479		NOTBOL ,
480		NOTEOL ,
481		UNGREEDY ,
482		NOTEMPTY ,
483		UTF8 ,
484		NO_AUTO_CAPTURE ,
485		NO_UTF8_CHECK ,
486		AUTO_CALLOUT ,
487		PARTIAL ,
488		DFA_SHORTEST ,
489		DFA_RESTART ,
490		FIRSTLINE ,
491		DUPNAMES ,
492		NEWLINE_CR ,
493		NEWLINE_LF ,
494		NEWLINE_CRLF ,
495		NEWLINE_ANY ,
496		NEWLINE_ANYCRLF ,
497		BSR_ANYCRLF ,
498		BSR_UNICODE ,
499		STRING_OFFSETS})
500
501
502		--****
503		-- === Create/Destroy
504
505		--**
506		-- Regular expression type
507
508	242	public type regex(object o)
509	242	return sequence(o)
510		end type
511
512		--**
513		-- Regular expression option specification type
514		--
515		-- Although the functions do not use this type (they return an error instead),
516		-- you can use this to check if your routine is receiving something sane.
517	207	public type option_spec(object o)
518	207	if atom(o) then
519	201	if not integer(o) then
520	0	return 0
521		else
522	201	if (or_bits(o,all_options) != all_options) then
523	0	return 0
524		else
525	201	return 1
526		end if
527		end if
528	6	elsif integer_array(o) then
529	6	return option_spec(or_all(o))
530		else
531	0	return 0
532		end if
533		end type
534
535		--**
536		-- Converts an option spec to a string.
537		--
538		-- This can be useful for debugging what options were passed in.
539		-- Without it you have to convert a number to hex and lookup the
540		-- constants in the source code.
541	1	public function option_spec_to_string(option_spec o)
542	1	return flags:flags_to_string(o, option_names)
543		end function
544
545		--**
546		-- Converts a regex error to a string.
547		--
548		-- This can be useful for debugging and even something rough to give to
549		-- the user in case of a regex failure. It's preferable to
550		-- a number.
551		--
552		-- See Also:
553		-- [[:error_message]]
554	26	public function error_to_string(integer i)
555	26	if i >= 0 or i < -23 then
556	2	return sprintf("%d",{i})
557		else
558	24	return vlookup(i, error_names, 1, 2, "Unknown Error")
559		end if
560		end function
561
562		--**
563		-- Return an allocated regular expression
564		--
565		-- Parameters:
566		-- # ##pattern## : a sequence representing a human readable regular expression
567		-- # ##options## : defaults to [[:DEFAULT]]. See [[:Compile Time Option Constants]].
568		--
569		-- Returns:
570		-- A regex, which other regular expression routines can work on or an atom to indicate an
571		-- error. If an error, you can call [[:error_message]] to get a detailed error message.
572		--
573		-- Comments:
574		-- This is the only routine that accepts a human readable regular expression. The string is
575		-- compiled and a [[:regex]] is returned. Analyzing and compiling a regular expression is a
576		-- costly operation and should not be done more than necessary. For instance, if your application
577		-- looks for an email address among text frequently, you should create the regular expression
578		-- as a constant accessible to your source code and any files that may use it, thus, the regular
579		-- expression is analyzed and compiled only once per run of your application.
580		--
581		--
582		-- -- Bad Example
583		-- include std/regex.e as re
584		--
585		-- while sequence(line) do
586		-- re:regex proper_name = re:new("[A-Z][a-z]+ [A-Z][a-z]+")
587		-- if re:find(proper_name, line) then
588		-- -- code
589		-- end if
590		-- end while
591		--
592		--
593		--
594		-- -- Good Example
595		-- include std/regex.e as re
596		-- constant re_proper_name = re:new("[A-Z][a-z]+ [A-Z][a-z]+")
597		-- while sequence(line) do
598		-- if re:find(re_proper_name, line) then
599		-- -- code
600		-- end if
601		-- end while
602		--
603		--
604		-- Example 1:
605		--
606		-- include std/regex.e as re
607		-- re:regex number = re:new("[0-9]+")
608		--
609		--
610		-- Note:
611		-- For simple matches, the built-in Euphoria
612		-- routine [[:eu:match]] and the library routine [[:wildcard:is_match]]
613		-- are often times easier to use and
614		-- a little faster. Regular expressions are faster for complex searching/matching.
615		--
616		-- See Also:
617		-- [[:error_message]], [[:find]], [[:find_all]]
618
619	47	public function new(string pattern, option_spec options=DEFAULT)
620	47	if sequence(options) then options = or_all(options) end if
621
622		-- concatenation ensures we really get a new sequence, and don't just use the
623		-- one passed in, which could be another regex previously created...this may
624		-- be a bug with the refcount/delete_instance/regex code
625	47	return machine_func(M_PCRE_COMPILE, { pattern, options })
626		end function
627
628		--**
629		-- If ##[[:new]]## returns an atom, this function will return a text error message
630		-- as to the reason.
631		--
632		-- Parameters:
633		-- # ##re##: Regular expression to get the error message from
634		--
635		-- Returns:
636		-- An atom (0) when no error message exists, otherwise a sequence describing the error.
637		--
638		-- Example 1:
639		--
640		-- include std/regex.e
641		-- object r = regex:new("[A-Z[a-z]*")
642		-- if atom(r) then
643		-- printf(1, "Regex failed to compile: %s\n", { regex:error_message(r) })
644		-- end if
645		--
646		--
647
648	2	public function error_message(object re)
649	2	return machine_func(M_PCRE_ERROR_MESSAGE, { re })
650		end function
651
652		--****
653		-- === Utility Routines
654		--
655
656		--**
657		-- Escape special regular expression characters that may be entered into a search
658		-- string from user input.
659		--
660		-- Notes:
661		-- Special regex characters are:
662		-- {{{
663		-- . \ + * ? [ ^ ] $ ( ) { } = ! < > \| : -
664		-- }}}
665		--
666		-- Parameters:
667		-- # ##s##: string sequence to escape
668		--
669		-- Returns:
670		-- An escaped ##sequence## representing ##s##.
671		--
672		-- Example 1:
673		--
674		-- include std/regex.e as re
675		-- sequence search_s = re:escape("Payroll is $***15.00")
676		-- -- search_s = "Payroll is \\$\\*\\*\\*15\\.00"
677		--
678		--
679
680	1	public function escape(string s)
681	1	return text:escape(s, ".\\+*?[^]$(){}=!<>\|:-")
682		end function
683
684		--**
685		-- Returns the number of capturing subpatterns (the ovector size) for a regex
686		--
687		-- Parameters:
688		-- # ##ex## : a regex
689		-- # ##maxsize## : optional maximum number of named groups to get data from
690		--
691		-- Returns:
692		-- An integer
693		--
694
695	85	public function get_ovector_size(regex ex, integer maxsize=0)
696
697	85	integer m = machine_func(M_PCRE_GET_OVECTOR_SIZE, {ex})
698	85	if (m > maxsize) then
699	0	return maxsize
700		end if
701	85	return m+1
702		end function
703
704		--****
705		-- === Match
706
707		--**
708		-- Return the first match of ##re## in ##haystack##. You can optionally start at the position
709		-- ##from##.
710		--
711		-- Parameters:
712		-- # ##re## : a regex for a subject to be matched against
713		-- # ##haystack## : a string in which to search
714		-- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1
715		-- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
716		-- The only options that
717		-- may be set when calling find are [[:ANCHORED]], [[:NEWLINE_CR]], [[:NEWLINE_LF]],
718		-- [[:NEWLINE_CRLF]], [[:NEWLINE_ANY]], [[:NEWLINE_ANYCRLF]], [[:NOTBOL]], [[:NOTEOL]],
719		-- [[:NOTEMPTY]], and [[:NO_UTF8_CHECK]].
720		-- ##options## can be any match time option or a
721		-- sequence of valid options or it can be a value that comes from using or_bits on
722		-- any two valid option values.
723		-- # ##size## : internal (how large an array the C backend should allocate). Defaults to ##get_ovector_size(re, 30)##; in rare cases this number may need to be increased in order to accommodate complex regex expressions.
724		--
725		-- Returns:
726		-- An object, which is either an atom of 0, meaning nothing matched or a sequence of matched pairs.
727		-- For the explanation of the returned sequence, please see the first example.
728		--
729		-- Example 1:
730		--
731		-- include std/regex.e as re
732		-- r = re:new("([A-Za-z]+) ([0-9]+)") -- John 20 or Jane 45
733		-- object result = re:find(r, "John 20")
734		--
735		-- -- The return value will be:
736		-- -- {
737		-- -- { 1, 7 }, -- Total match
738		-- -- { 1, 4 }, -- First grouping "John" ([A-Za-z]+)
739		-- -- { 6, 7 } -- Second grouping "20" ([0-9]+)
740		-- -- }
741		--
742		--
743
744	85	public function find(regex re, string haystack, integer from=1, option_spec options=DEFAULT, integer size = get_ovector_size(re, 30))
745	85	if sequence(options) then options = or_all(options) end if
746	85	if size < 0 then
747	0	size = 0
748		end if
749
750	85	return machine_func(M_PCRE_EXEC, { re, haystack, options, from, size })
751		end function
752
753		--**
754		-- Return all matches of ##re## in ##haystack## optionally starting at the sequence position
755		-- ##from##.
756		--
757		-- Parameters:
758		-- # ##re## : a regex for a subject to be matched against
759		-- # ##haystack## : a string in which to search
760		-- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1
761		-- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
762		--
763		-- Returns:
764		-- A sequence of sequences that were returned by [[:find]] and in the case of
765		-- no matches this returns an empty sequence.
766		-- Please see [[:find]] for a detailed description of each member of the return
767		-- sequence.
768		--
769		-- Example 1:
770		--
771		-- include std/regex.e as re
772		-- constant re_number = re:new("[0-9]+")
773		-- object matches = re:find_all(re_number, "10 20 30")
774		--
775		-- -- matches is:
776		-- -- {
777		-- -- {{1, 2}},
778		-- -- {{4, 5}},
779		-- -- {{7, 8}}
780		-- -- }
781		--
782		--
783
784	16	public function find_all(regex re, string haystack, integer from=1, option_spec options=DEFAULT)
785	16	if sequence(options) then options = or_all(options) end if
786
787	16	object result
788	16	sequence results = {}
789	16	while sequence(result) with entry do
790	27	results = append(results, result)
791	27	from = max(result) + 1
792
793	27	if from > length(haystack) then
794	10	exit
795		end if
796		entry
797	33	result = find(re, haystack, from, options)
798	33	end while
799
800	16	return results
801		end function
802
803		--**
804		-- Determine if ##re## matches any portion of ##haystack##.
805		--
806		-- Parameters:
807		-- # ##re## : a regex for a subject to be matched against
808		-- # ##haystack## : a string in which to search
809		-- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1
810		-- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
811		-- ##options## can be any match time option or a
812		-- sequence of valid options or it can be a value that comes from using or_bits on
813		-- any two valid option values.
814		--
815		-- Returns:
816		-- An atom, 1 if ##re## matches any portion of ##haystack## or 0 if not.
817		--
818
819	2	public function has_match(regex re, string haystack, integer from=1, option_spec options=DEFAULT)
820	2	return sequence(find(re, haystack, from, options))
821		end function
822
823		--**
824		-- Determine if the entire ##haystack## matches ##re##.
825		--
826		-- Parameters:
827		-- # ##re## : a regex for a subject to be matched against
828		-- # ##haystack## : a string in which to search
829		-- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1
830		-- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
831		-- ##options## can be any match time option or a
832		-- sequence of valid options or it can be a value that comes from using or_bits on
833		-- any two valid option values.
834		--
835		-- Returns:
836		-- An atom, 1 if ##re## matches the entire ##haystack## or 0 if not.
837		--
838
839	15	public function is_match(regex re, string haystack, integer from=1, option_spec options=DEFAULT)
840	15	object m = find(re, haystack, from, options)
841
842	15	if sequence(m) and length(m) > 0 and m[1][1] = 1 and m[1][2] = length(haystack) then
843	9	return 1
844		end if
845
846	6	return 0
847		end function
848
849		--**
850		-- Get the matched text only.
851		--
852		-- Parameters:
853		-- # ##re## : a regex for a subject to be matched against
854		-- # ##haystack## : a string in which to search
855		-- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1
856		-- # ##options## : defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
857		-- ##options## can be any match time option or STRING_OFFSETS or a
858		-- sequence of valid options or it can be a value that comes from using or_bits on
859		-- any two valid option values.
860		--
861		-- Returns:
862		-- Returns a sequence of strings, the first being the entire match and subsequent
863		-- items being each of the captured groups or ERROR_NOMATCH if there is no match.
864		-- The size of the sequence is the number
865		-- of groups in the expression plus one (for the entire match).
866		--
867		-- If ##options## contains the bit [[:STRING_OFFSETS]], then the result is different.
868		-- For each item, a sequence is returned containing the matched text, the starting
869		-- index in ##haystack## and the ending index in ##haystack##.
870		--
871		-- Example 1:
872		--
873		-- include std/regex.e as re
874		-- constant re_name = re:new("([A-Z][a-z]+) ([A-Z][a-z]+)")
875		--
876		-- object matches = re:matches(re_name, "John Doe and Jane Doe")
877		-- -- matches is:
878		-- -- {
879		-- -- "John Doe", -- full match data
880		-- -- "John", -- first group
881		-- -- "Doe" -- second group
882		-- -- }
883		--
884		-- matches = re:matches(re_name, "John Doe and Jane Doe", 1, re:STRING_OFFSETS)
885		-- -- matches is:
886		-- -- {
887		-- -- { "John Doe", 1, 8 }, -- full match data
888		-- -- { "John", 1, 4 }, -- first group
889		-- -- { "Doe", 6, 8 } -- second group
890		-- -- }
891		--
892		--
893		-- See Also:
894		-- [[:all_matches]]
895		--
896	9	public function matches(regex re, string haystack, integer from=1, option_spec options=DEFAULT)
897	9	if sequence(options) then options = or_all(options) end if
898	9	integer str_offsets = and_bits(STRING_OFFSETS, options)
899	9	object match_data = find(re, haystack, from, and_bits(options, not_bits(STRING_OFFSETS)))
900
901	9	if atom(match_data) then return ERROR_NOMATCH end if
902
903	5	for i = 1 to length(match_data) do
904	19	sequence tmp
905	19	if match_data[i][1] = 0 then
906	0	tmp = ""
907		else
908	19	tmp = haystack[match_data[i][1]..match_data[i][2]]
909		end if
910	19	if str_offsets then
911	3	match_data[i] = { tmp, match_data[i][1], match_data[i][2] }
912		else
913	16	match_data[i] = tmp
914		end if
915	19	end for
916
917	5	return match_data
918		end function
919
920		--**
921		-- Get the text of all matches
922		--
923		-- Parameters:
924		-- # ##re## : a regex for a subject to be matched against
925		-- # ##haystack## : a string in which to search
926		-- # ##from## : an integer setting the starting position to begin searching from. Defaults to 1
927		-- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
928		-- ##options## can be any match time option or a
929		-- sequence of valid options or it can be a value that comes from using or_bits on
930		-- any two valid option values.
931		--
932		-- Returns:
933		-- Returns ERROR_NOMATCH if there are no matches, or a sequence of sequences of
934		-- strings if there is at least one match. In each member sequence of the returned sequence,
935		-- the first string is the entire match and subsequent items being each of the
936		-- captured groups. The size of the sequence is
937		-- the number of groups in the expression plus one (for the entire match). In other words,
938		-- each member of the return value will have the same structure as that returned by
939		-- [[:matches]].
940		--
941		-- If ##options## contains the bit [[:STRING_OFFSETS]], then the result is different.
942		-- In each member sequence, instead of each member being a string each member is itself a sequence
943		-- containing the matched text, the starting index in ##haystack## and the ending
944		-- index in ##haystack##.
945		--
946		-- Example 1:
947		--
948		-- include std/regex.e as re
949		-- constant re_name = re:new("([A-Z][a-z]+) ([A-Z][a-z]+)")
950		--
951		-- object matches = re:all_matches(re_name, "John Doe and Jane Doe")
952		-- -- matches is:
953		-- -- {
954		-- -- { -- first match
955		-- -- "John Doe", -- full match data
956		-- -- "John", -- first group
957		-- -- "Doe" -- second group
958		-- -- },
959		-- -- { -- second match
960		-- -- "Jane Doe", -- full match data
961		-- -- "Jane", -- first group
962		-- -- "Doe" -- second group
963		-- -- }
964		-- -- }
965		--
966		-- matches = re:all_matches(re_name, "John Doe and Jane Doe", re:STRING_OFFSETS)
967		-- -- matches is:
968		-- -- {
969		-- -- { -- first match
970		-- -- { "John Doe", 1, 8 }, -- full match data
971		-- -- { "John", 1, 4 }, -- first group
972		-- -- { "Doe", 6, 8 } -- second group
973		-- -- },
974		-- -- { -- second match
975		-- -- { "Jane Doe", 14, 21 }, -- full match data
976		-- -- { "Jane", 14, 17 }, -- first group
977		-- -- { "Doe", 19, 21 } -- second group
978		-- -- }
979		-- -- }
980		--
981		--
982		-- See Also:
983		-- [[:matches]]
984
985	3	public function all_matches(regex re, string haystack, integer from=1, option_spec options=DEFAULT)
986	3	if sequence(options) then options = or_all(options) end if -- fold a list of option flags into a single value
987	3	integer str_offsets = and_bits(STRING_OFFSETS, options) -- non-zero when the caller asked for offsets
988	3	object match_data = find_all(re, haystack, from, and_bits(options, not_bits(STRING_OFFSETS))) -- STRING_OFFSETS is cleared before the options reach find_all()
989
990	3	if length(match_data) = 0 then return ERROR_NOMATCH end if
991
992	2	for i = 1 to length(match_data) do -- every match found
993	4	for j = 1 to length(match_data[i]) do -- the full match, then each captured group
994	12	sequence tmp = "" if match_data[i][j][1] != 0 then tmp = haystack[match_data[i][j][1]..match_data[i][j][2]] end if -- a start index of 0 yields "" (same rule as matches()) instead of an invalid slice
995	12	if str_offsets then
996	6	match_data[i][j] = { tmp, match_data[i][j][1], match_data[i][j][2] } -- text plus its original start and end indexes
997		else
998	6	match_data[i][j] = tmp
999		end if
1000	12	end for
1001	4	end for
1002
1003	2	return match_data
1004		end function
1005
1006		--****
1007		-- === Splitting
1008
1009		--**
1010		-- Split a string based on a regex as a delimiter
1011		--
1012		-- Parameters:
1013		-- # ##re## : a regex which will be used for matching
1014		-- # ##text## : a string to be split
1015		-- # ##from## : optional start position
1016		-- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
1017		-- ##options## can be any match time option or a
1018		-- sequence of valid options or it can be a value that comes from using or_bits on
1019		-- any two valid option values.
1020		--
1021		-- Returns:
1022		-- A sequence of string values split at the delimiter and if no delimiters were matched
1023		-- this sequence will be a one member sequence equal to ##{text}##.
1024		--
1025		-- Example 1:
1026		--
1027		-- include std/regex.e as re
1028		-- regex comma_space_re = re:new(`,\s`)
1029		-- sequence data = re:split(comma_space_re, "euphoria programming, source code, reference data")
1030		-- -- data is
1031		-- -- {
1032		-- -- "euphoria programming",
1033		-- -- "source code",
1034		-- -- "reference data"
1035		-- -- }
1036		--
1037		--
1038
1039	1	public function split(regex re, string text, integer from=1, option_spec options=DEFAULT)
1040	1	return split_limit(re, text, 0, from, options) -- a limit of 0 makes split_limit use every delimiter match
1041		end function
1042
1043	2	public function split_limit(regex re, string text, integer limit=0, integer from=1, option_spec options=DEFAULT) -- split ##text## at matches of ##re##, using at most ##limit## delimiters (0 means all of them)
1044	2	if sequence(options) then options = or_all(options) end if -- fold a list of option flags into a single value
1045	2	sequence match_data = find_all(re, text, from, options), result
1046	2	integer last = 1 -- index of the first character not yet copied into result
1047
1048	2	if limit = 0 or limit > length(match_data) then -- 0 or an over-large limit means "use every match"
1049	1	limit = length(match_data)
1050		end if
1051
1052	2	result = repeat(0, limit)
1053
1054	2	for i = 1 to limit do
1055	3	result[i] = text[last..match_data[i][1][1] - 1] -- text between the previous delimiter and this one
1056	3	last = match_data[i][1][2] + 1 -- resume just past this delimiter
1057	3	end for
1058
1059	2	if last <= length(text) then -- "<=" keeps a one-character tail, e.g. the "b" of "a,b"
1060	2	result &= { text[last..$] }
1061		end if
1062
1063	2	return result
1064		end function
1065
1066		--****
1067		-- === Replacement
1068		--
1069
1070		--**
1071		-- Replaces all matches of a regex with the replacement text.
1072		--
1073		-- Parameters:
1074		-- # ##ex## : a regex which will be used for matching
1075		-- # ##text## : a string on which search and replace will apply
1076		-- # ##replacement## : a string, used to replace each of the full matches
1077		-- # ##from## : optional start position
1078		-- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
1079		-- ##options## can be any match time option or a
1080		-- sequence of valid options or it can be a value that comes from using or_bits on
1081		-- any two valid option values.
1082		--
1083		-- Returns:
1084		-- A sequence, the modified ##text##. If there is no match with ##re## the
1085		-- return value will be the same as ##text## when it was passed in.
1086		--
1087		-- Special replacement operators:
1088		--
1089		-- * ##\## ~-- Causes the next character to lose its special meaning.
1090		-- * ##\n## ~ -- Inserts a 0x0A (LF) character.
1091		-- * ##\r## ~-- Inserts a 0x0D (CR) character.
1092		-- * ##\t## ~-- Inserts a 0x09 (TAB) character.
1093		-- * ##\1## to ##\9## ~-- Recalls stored substrings from registers (\1, \2, \3, to \9).
1094		-- * ##\0## ~-- Recalls entire matched pattern.
1095		-- * ##\u## ~-- Convert next character to uppercase
1096		-- * ##\l## ~-- Convert next character to lowercase
1097		-- * ##\U## ~-- Convert to uppercase till ##\E## or ##\e##
1098		-- * ##\L## ~-- Convert to lowercase till ##\E## or ##\e##
1099		-- * ##\E## or ##\e## ~-- Terminate a ##{{{\\}}}U## or ##\L## conversion
1100		--
1101		-- Example 1:
1102		--
1103		-- include std/regex.e
1104		-- regex r = new(`([A-Za-z]+)\.([A-Za-z]+)`)
1105		-- sequence details = find_replace(r, "hello.txt", `Filename: \U\1\e Extension: \U\2\e`)
1106		-- -- details = "Filename: HELLO Extension: TXT"
1107		--
1108		--
1109
1110	5	public function find_replace(regex ex, string text, sequence replacement, integer from=1,
1111		option_spec options=DEFAULT)
1112	5	return find_replace_limit(ex, text, replacement, -1, from, options) -- NOTE(review): passes -1 as the limit, while the find_replace_limit documentation describes 0 as "replace all"; confirm both are treated as unlimited
1113		end function
1114
1115		--**
1116		-- Replaces up to ##limit## matches of ##ex## in ##text## except when ##limit## is 0. When
1117		-- ##limit## is 0, this routine replaces all of the matches.
1118		--
1119		-- This function is identical to [[:find_replace]] except it allows you to limit the number of
1120		-- replacements to perform. Please see the documentation for [[:find_replace]] for all the
1121		-- details.
1122		--
1123		-- Parameters:
1124		-- # ##ex## : a regex which will be used for matching
1125		-- # ##text## : a string on which search and replace will apply
1126		-- # ##replacement## : a string, used to replace each of the full matches
1127		-- # ##limit## : the number of matches to process
1128		-- # ##from## : optional start position
1129		-- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
1130		-- ##options## can be any match time option or a
1131		-- sequence of valid options or it can be a value that comes from using or_bits on
1132		-- any two valid option values.
1133		--
1134		-- Returns:
1135		-- A sequence, the modified ##text##.
1136		--
1137		-- See Also:
1138		-- [[:find_replace]]
1139		--
1140
1141	6	public function find_replace_limit(regex ex, string text, sequence replacement,
1142		integer limit, integer from=1, option_spec options=DEFAULT)
1143	6	if sequence(options) then options = or_all(options) end if -- fold a list of option flags into a single value
1144
1145	6	return machine_func(M_PCRE_REPLACE, { ex, text, replacement, options, from, limit }) -- the replacement itself is done by the M_PCRE_REPLACE machine function; note the argument order differs from this routine's parameter order
1146		end function
1147
1148		--**
1149		-- When ##limit## is positive,
1150		-- this routine replaces up to ##limit## matches of ##ex## in ##text## with the
1151		-- result of the user
1152		-- defined callback, ##rid##, and when ##limit## is 0, replaces
1153		-- all matches of ##ex## in ##text## with the result of this user defined callback, ##rid##.
1154		--
1155		-- The callback should take one sequence. The first member of this sequence will be
1156		-- a string
1157		-- representing the entire match and the subsequent members, if they exist,
1158		-- will be strings
1159		-- for the captured groups within the regular expression.
1160		--
1161		-- Parameters:
1162		-- # ##ex## : a regex which will be used for matching
1163		-- # ##text## : a string on which search and replace will apply
1164		-- # ##rid## : routine id to execute for each match
1165		-- # ##limit## : the number of matches to process
1166		-- # ##from## : optional start position
1167		-- # ##options## : options, defaults to [[:DEFAULT]]. See [[:Match Time Option Constants]].
1168		-- ##options## can be any match time option or a
1169		-- sequence of valid options or it can be a value that comes from using or_bits on
1170		-- any two valid option values.
1171		--
1172		-- Returns:
1173		-- A sequence, the modified ##text##.
1174		--
1175		-- Example 1:
1176		--
1177		-- include std/regex.e as re
1178		-- function my_convert(sequence params)
1179		-- switch params[1] do
1180		-- case "1" then
1181		-- return "one "
1182		-- case "2" then
1183		-- return "two "
1184		-- case else
1185		-- return "unknown "
1186		-- end switch
1187		-- end function
1188		--
1189		-- regex r = re:new(`\d`)
1190		-- sequence result = re:find_replace_callback(r, "125", routine_id("my_convert"))
1191		-- -- result = "one two unknown "
1192		--
1193		--
1194
1195	3	public function find_replace_callback(regex ex, string text, integer rid, integer limit=0,
1196		integer from=1, option_spec options=DEFAULT)
1197	3	if sequence(options) then options = or_all(options) end if -- fold a list of option flags into a single value
1198	3	sequence match_data = find_all(ex, text, from, options), replace_data
1199
1200	3	if limit = 0 or limit > length(match_data) then -- 0 or an over-large limit means "process every match"
1201	2	limit = length(match_data)
1202		end if
1203	3	replace_data = repeat(0, limit)
1204
1205	3	for i = 1 to limit do -- first pass: ask the callback for each replacement while text is still unmodified
1206	4	sequence params = repeat(0, length(match_data[i]))
1207	4	for j = 1 to length(match_data[i]) do
1208	12	if match_data[i][j][1] = 0 then params[j] = "" else params[j] = text[match_data[i][j][1]..match_data[i][j][2]] end if -- a start index of 0 cannot be sliced; the callback receives "" for that group
1209	12	end for
1210
1211	4	replace_data[i] = call_func(rid, { params })
1212	4	end for
1213
1214	3	for i = limit to 1 by -1 do -- second pass runs last-to-first so earlier match offsets stay valid
1215	4	text = replace(text, replace_data[i], match_data[i][1][1], match_data[i][1][2])
1216	4	end for
1217
1218	3	return text
1219		end function