# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2001-2015 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file:
#
# ICU regular expression test cases.
#
# format: one test case per line,
# <test case> = <pattern> <flags> <match string> [# comment]
# <pattern> = "<regular expression pattern>"
# <match string> = "<tagged string>"
# the quotes on the pattern and match string can be " or ' or /
# <tagged string> = text, with the start and end of each
# capture group tagged with <n>...</n>. The overall match,
# if any, is group 0, as in <0>matched text</0>
# A region can be specified with <r>...</r> tags.
# Standard ICU unescape will be applied, allowing \u, \U, etc. to appear.
#
# <flags> = any combination of
# i case insensitive match
# x free spacing and comments
# s dot-matches-all mode
# m multi-line mode.
# ($ and ^ match at embedded new-lines)
# D Unix Lines mode (only recognize 0x0a as new-line)
# Q UREGEX_LITERAL flag. Entire pattern is literal string.
# v If ICU is configured without break iteration, this
# regex test pattern should not compile.
# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
# d dump the compiled pattern
# t trace operation of match engine.
# 2-9 a digit between 2 and 9, specifies the number of
# times to execute find(). The expected results are
# for the last find() in the sequence.
# G Only check match / no match. Do not check capture groups.
# E Pattern compilation error expected
# L Use LookingAt() rather than find()
# M Use matches() rather than find().
#
# a Use non-Anchoring Bounds.
# b Use Transparent Bounds.
# The a and b options only make a difference if
# a <r>region</r> has been specified in the string.
# z|Z hitEnd was expected(z) or not expected (Z).
# With neither, hitEnd is not checked.
# y|Y Require End expected(y) or not expected (Y).
#
# White space must be present between the flags and the match string.
#
# Look-ahead expressions
#
"(?!0{5})(\d{5})" "<0><1>00001</1></0>zzzz"
"(?!0{5})(\d{5})z" "<0><1>00001</1>z</0>zzz"
"(?!0{5})(\d{5})(?!y)" "<0><1>00001</1></0>zzzz"
"abc(?=def)" "<0>abc</0>def"
"(.*)(?=c)" "<0><1>ab</1></0>cdef"
"(?:.*)(?=c)" "<r>ab</r>cdef"
"(?:.*)(?=c)" b "<r><0>ab</0></r>cdef" # transparent bounds
"(?:.*)(?=c)" bM "<r><0>ab</0></r>cdef" # transparent bounds
"(?:.*)(?=(c))" b "<0>ab</0><1>c</1>def" # Capture in look-ahead
"(?=(.)\1\1)\1" "abcc<0><1>d</1></0>ddefg" # Backrefs to look-ahead capture
".(?!\p{L})" "abc<0>d</0> " # Negated look-ahead
".(?!(\p{L}))" "abc<0>d</0> " # Negated look-ahead, no capture
# visible outside of look-ahead
"and(?=roid)" L "<0>and</0>roid"
"and(?=roid)" M "<r>and</r>roid"
"and(?=roid)" bM "<r><0>and</0></r>roid"
"and(?!roid)" L "<0>and</0>roix"
"and(?!roid)" L "android"
"and(?!roid)" M "<r><0>and</0></r>roid" # Opaque bounds
"and(?!roid)" bM "<r>and</r>roid"
"and(?!roid)" bM "<r><0>and</0></r>roix"
#
# Negated Lookahead, various regions and region transparency
#
"abc(?!def)" "<0>abc</0>xyz"
"abc(?!def)" "abcdef"
"abc(?!def)" "<r><0>abc</0></r>def"
"abc(?!def)" b "<r>abc</r>def"
"abc(?!def)" b "<r><0>abc</0></r>xyz"
#
# Nested Lookahead / Behind
#
"one(?=(?:(?!<out>).)*</out>)" "<out><0>one</0> stuff</out>"
"one(?=(?:(?!<out>).)*</out>)" "<out>one <out></out>"
# More nesting lookaround: pattern matches "qq" when not preceded by 'a' and followed by 'z'
"(?<!a(?!...z))qq" "<0>qq</0>c"
"(?<!a(?!...z))qq" "f<0>qq</0>c"
"(?<!a(?!...z))qq" "aqqz"
# More nested lookaround: match any two chars preceded and followed by an upper case letter.
# With gratuitous nesting of look-arounds and capture from the look-arounds.
"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "<1>A</1><0>jk</0><2>B</2>"
"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "ajkB"
"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "Ajkb"
# Nested lookaround cases from bug ICU-20564
"(?<=(?<=((?=)){0}+))" "<0></0>abc"
"(?<=c(?<=c((?=c)){1}+))" "c<0><1></1></0>cc"
#
# Anchoring Bounds
#
"^def$" "abc<r><0>def</0></r>ghi" # anchoring (default) bounds
"^def$" a "abc<r>def</r>ghi" # non-anchoring bounds
"^def" a "<r><0>def</0></r>ghi" # non-anchoring bounds
"def$" a "abc<r><0>def</0></r>" # non-anchoring bounds
"^.*$" m "<0>line 1</0>\n line 2"
"^.*$" m2 "line 1\n<0> line 2</0>"
"^.*$" m3 "line 1\n line 2"
"^.*$" m "li<r><0>ne </0></r>1\n line 2" # anchoring bounds
"^.*$" m2 "li<r>ne </r>1\n line 2" # anchoring bounds
"^.*$" am "li<r>ne </r>1\n line 2" # non-anchoring bounds
"^.*$" am "li\n<r><0>ne </0></r>\n1\n line 2" # non-anchoring bounds
#
# HitEnd and RequireEnd for new-lines just before end-of-input
#
"xyz$" yz "<0>xyz</0>\n"
"xyz$" yz "<0>xyz</0>\x{d}\x{a}"
"xyz$" myz "<0>xyz</0>" # multi-line mode
"xyz$" mYZ "<0>xyz</0>\n"
"xyz$" mYZ "<0>xyz</0>\r\n"
"xyz$" mYZ "<0>xyz</0>\x{85}abcd"
"xyz$" Yz "xyz\nx"
"xyz$" Yz "xyza"
"xyz$" yz "<0>xyz</0>"
#
# HitEnd
#
"abcd" Lz "a"
"abcd" Lz "ab"
"abcd" Lz "abc"
"abcd" LZ "<0>abcd</0>"
"abcd" LZ "<0>abcd</0>e"
"abcd" LZ "abcx"
"abcd" LZ "abx"
"abcd" Lzi "a"
"abcd" Lzi "ab"
"abcd" Lzi "abc"
"abcd" LZi "<0>abcd</0>"
"abcd" LZi "<0>abcd</0>e"
"abcd" LZi "abcx"
"abcd" LZi "abx"
#
# All Unicode line endings recognized.
# 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
# Multi-line and non-multiline mode take different paths, so repeated tests.
#
"^def$" mYZ "abc\x{a}<0>def</0>\x{a}ghi"
"^def$" mYZ "abc\x{b}<0>def</0>\x{b}ghi"
"^def$" mYZ "abc\x{c}<0>def</0>\x{c}ghi"
"^def$" mYZ "abc\x{d}<0>def</0>\x{d}ghi"
"^def$" mYZ "abc\x{85}<0>def</0>\x{85}ghi"
"^def$" mYZ "abc\x{2028}<0>def</0>\x{2028}ghi"
"^def$" mYZ "abc\x{2029}<0>def</0>\x{2029}ghi"
"^def$" mYZ "abc\r\n<0>def</0>\r\nghi"
"^def$" yz "<0>def</0>\x{a}"
"^def$" yz "<0>def</0>\x{b}"
"^def$" yz "<0>def</0>\x{c}"
"^def$" yz "<0>def</0>\x{d}"
"^def$" yz "<0>def</0>\x{85}"
"^def$" yz "<0>def</0>\x{2028}"
"^def$" yz "<0>def</0>\x{2029}"
"^def$" yz "<0>def</0>\r\n"
"^def$" yz "<0>def</0>"
"^def$" "<0>def</0>\x{2028" #TODO: should be an error of some sort.
#
# UNIX_LINES mode
#
"abc$" D "<0>abc</0>\n"
"abc$" D "abc\r"
"abc$" D "abc\u0085"
"a.b" D "<0>a\rb</0>"
"a.b" D "a\nb"
"(?d)abc$" "<0>abc</0>\n"
"(?d)abc$" "abc\r"
"abc$" mD "<0>abc</0>\ndef"
"abc$" mD "abc\rdef"
".*def" L "abc\r def xyz" # Normal mode, LookingAt() stops at \r
".*def" DL "<0>abc\r def</0> xyz" # Unix Lines mode, \r not line end.
".*def" DL "abc\n def xyz"
"(?d)a.b" "a\nb"
"(?d)a.b" "<0>a\rb</0>"
"^abc" m "xyz\r<0>abc</0>"
"^abc" Dm "xyz\rabc"
"^abc" Dm "xyz\n<0>abc</0>"
# Capturing parens
".(..)." "<0>a<1>bc</1>d</0>"
".*\A( +hello)" "<0><1> hello</1></0>"
"(hello)|(goodbye)" "<0><1>hello</1></0>"
"(hello)|(goodbye)" "<0><2>goodbye</2></0>"
"abc( +( inner(X?) +) xyz)" "leading cruft <0>abc<1> <2> inner<3></3> </2> xyz</1></0> cruft"
"\s*([ixsmdt]*)([:letter:]*)" "<0> <1>d</1><2></2></0> "
"(a|b)c*d" "a<0><1>b</1>cd</0>"
# Non-capturing parens (?: stuff). Groups, but does not capture.
"(?:abc)*(tail)" "<0>abcabcabc<1>tail</1></0>"
# Non-greedy *? quantifier
".*?(abc)" "<0> abx <1>abc</1></0> abc abc abc"
".*(abc)" "<0> abx abc abc abc <1>abc</1></0>"
"((?:abc |xyz )*?)abc " "<0><1>xyz </1>abc </0>abc abc "
"((?:abc |xyz )*)abc " "<0><1>xyz abc abc </1>abc </0>"
# Non-greedy +? quantifier
"(a+?)(a*)" "<0><1>a</1><2>aaaaaaaaaaaa</2></0>"
"(a+)(a*)" "<0><1>aaaaaaaaaaaaa</1><2></2></0>"
"((ab)+?)((ab)*)" "<0><1><2>ab</2></1><3>ababababab<4>ab</4></3></0>"
"((ab)+)((ab)*)" "<0><1>abababababab<2>ab</2></1><3></3></0>"
# Non-greedy ?? quantifier
"(ab)(ab)??(ab)??(ab)??(ab)??c" "<0><1>ab</1><4>ab</4><5>ab</5>c</0>"
# Unicode Properties as naked elements in a pattern
"\p{Lu}+" "here we go ... <0>ABC</0> and no more."
"(\p{L}+)(\P{L}*?) (\p{Zs}*)" "7999<0><1>letters</1><2>4949%^&*(</2> <3> </3></0>"
# \w and \W
"\w+" " $%^&*( <0>hello123</0>%^&*("
"\W+" "<0> $%^&*( </0>hello123%^&*("
# \A match at beginning of input only.
".*\Ahello" "<0>hello</0> hello"
".*hello" "<0>hello hello</0>"
".*\Ahello" "stuff\nhello" # don't match after embedded new-line.
# \b \B
#
".*?\b(.).*" "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>"
"\ba\b" "-<0>a</0>"
"\by\b" "xy"
"[ \b]" "<0>b</0>" # in a set, \b is a literal b.
# Finds first chars of up to 5 words
"(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?" "<0><1>T</1>the <2>q</2>ick <3>b</3>rown <4>f</4></0>ox"
"H.*?((?:\B.)+)" "<0>H<1>ello</1></0> "
".*?((?:\B.)+).*?((?:\B.)+).*?((?:\B.)+)" "<0>H<1>ello</1> <2> </2>g<3>oodbye</3></0> "
"(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?.*" "<0> \u0301 \u0301<1>A</1>\u0302BC\u0303\u0304<2> </2>\u0305 \u0306<3>X</3>\u0307Y\u0308</0>"
#
# Unicode word boundary mode
#
"(?w).*?\b" v "<0></0>hello, world"
"(?w).*?(\b.+?\b).*" v "<0><1> </1>123.45 </0>"
"(?w).*?(\b\d.*?\b).*" v "<0> <1>123.45</1> </0>"
".*?(\b.+?\b).*" "<0> <1>123</1>.45 </0>"
"(?w:.*?(\b\d.*?\b).*)" v "<0> <1>123.45</1> </0>"
"(?w:.*?(\b.+?\b).*)" v "<0><1>don't</1> </0>"
"(?w:.+?(\b\S.+?\b).*)" v "<0> <1>don't</1> </0>"
"(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)" v "<0><1>.</1><2> </2><3>,</3><4>:</4><5>$</5><6>37,000.50</6><7> </7> </0>"
#
# Unicode word boundaries with Regions
#
"(?w).*?\b" v "abc<r><0>def</0></r>ghi"
"(?w).*?\b" v2 "abc<r>def<0></0></r>ghi"
"(?w).*?\b" v3 "abc<r>def</r>ghi"
#"(?w).*?\b" vb "abc<r><0>def</0></r>ghi" # TODO: bug. Ticket 6073
#"(?w).*?\b" vb2 "abc<r>def</r>ghi"
# . does not match new-lines
"." "\u000a\u000d\u0085\u000c\u000b\u2028\u2029<0>X</0>\u000aY"
"A." "A\u000a "# no match
# \d for decimal digits
"\d*" "<0>0123456789\u0660\u06F9\u0969\u0A66\u17E2\uFF10\U0001D7CE\U0001D7FF</0>non-digits"
"\D+" "<0>non digits</0>"
"\D*(\d*)(\D*)" "<0>non-digits<1>3456666</1><2>more non digits</2></0>"
# \Q...\E quote mode
"hel\Qlo, worl\Ed" "<0>hello, world</0>"
"\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa</1></0>"
"[abc\Q]\r\E]+" "<0>aaaccc]]]\\\\\\</0>\r..." # \Q ... \E escape in a [set]
# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized.
# Note that data strings in test cases still get escape processing.
"abc\an\r\E\\abcd\u0031bye" Q "lead<0>abc\\an\\r\\E\\\\abcd\\u0031bye</0>extra"
"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral</0>"
# \S and \s space characters
"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
"(\S+).*?(\S+).*" "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>"
# \X consume one Grapheme Cluster.
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"
# Regional indicator pairs are grapheme clusters
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U0001f1e6\U0001f1e8</1><2>\U0001f1ea\U0001f1ff</2></0>"
# Grapheme Break rule 9b: Prepend x
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U000111C2x</1></0>"
# Grapheme clusters that straddle a match region. Matching is pinned to the region limits,
# giving boundaries inside grapheme clusters
"(\X)?(\X)?(\X)?" v "a\u0301<r><0><1>\u0301\u0301</1><2>z\u0302</2></0></r>\u0302\u0302"
# Same as previous test case, but without the region limits.
"(\X)?(\X)?(\X)?" v "<0><1>a\u0301\u0301\u0301</1><2>z\u0302\u0302\u0302</2></0>"
# ^ matches only at beginning of line
".*^(Hello)" "<0><1>Hello</1></0> Hello Hello Hello Goodbye"
".*(Hello)" "<0>Hello Hello Hello <1>Hello</1></0> Goodbye"
".*^(Hello)" " Hello Hello Hello Hello Goodbye"# No Match
# $ matches only at end of line, or before a newline preceding the end of line
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)" ZY "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
".*?(Goodbye)$" z "Hello Goodbye> Goodbye Goodbye "# No Match
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
".*?(Goodbye)$" z "Hello Goodbye Goodbye Goodbye\n\n"# No Match
# \Z matches at end of input, like $ with default flags.
".*?(Goodbye)\Z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)" ZY "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
".*?(Goodbye)\Z" z "Hello Goodbye> Goodbye Goodbye "# No Match
"here$" z "here\nthe end"# No Match
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
".*?(Goodbye)\Z" "Hello Goodbye Goodbye Goodbye\n\n"# No Match
# \z matches only at the end of string.
# no special treatment of new lines.
# no dependencies on flag settings.
".*?(Goodbye)\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye "# No Match
"here$" z "here\nthe end"# No Match
".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye\n"# No Match
".*?(Goodbye)\n\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
"abc\z|def" ZY "abc<0>def</0>"
# (?# comment) doesn't muck up pattern
"Hello (?# this is a comment) world" " <0>Hello world</0>..."
# Check some implementation corner cases based on the way literal strings are compiled.
"A" "<0>A</0>"
"AB" "<0>AB</0>ABABAB"
"AB+" "<0>ABBB</0>A"
"AB+" "<0>AB</0>ABAB"
"ABC+" "<0>ABC</0>ABC"
"ABC+" "<0>ABCCCC</0>ABC"
"(?:ABC)+" "<0>ABCABCABC</0>D"
"(?:ABC)DEF+" "<0>ABCDEFFF</0>D"
"AB\.C\eD\u0666E" "<0>AB.C\u001BD\u0666E</0>F"
"ab\Bde" "<0>abde</0>"
# loop breaking
"(a?)*" "<0><1></1></0>xyz"
"(a?)+" "<0><1></1></0>xyz"
"^(?:a?b?)*$" "a--"
"(x?)*xyz" "<0>xx<1></1>xyz</0>" # Slightly weird, but correct. The "last" time through (x?),
# it matches the empty string.
# Set expressions, basic operators and escapes work
#
"[\d]+" "<0>0123</0>abc/.,"
"[^\d]+" "0123<0>abc/.,</0>"
"[\D]+" "0123<0>abc/.,</0>"
"[^\D]+" "<0>0123</0>abc/.,"
"[\s]+" "<0> \t</0>abc/.,"
"[^\s]+" " \t<0>abc/.,</0>"
"[\S]+" " \t<0>abc/.,</0>"
"[^\S]+" "<0> \t</0>abc/.,"
"[\w]+" "<0>abc123</0> .,;"
"[^\w]+" "abc123<0> .,;</0>"
"[\W]+" "abc123<0> .,;</0>"
"[^\W]+" "<0>abc123</0> .,;"
"[\z]+" "abc<0>zzz</0>def" # \z has no special meaning
"[^\z]+" "<0>abc</0>zzzdef"
"[\^]+" "abc<0>^^</0>"
"[^\^]+" "<0>abc</0>^^"
"[\u0041c]+" "<0>AcAc</0>def"
"[\U00010002]+" "<0>\ud800\udc02</0>\U00010003"
"[^\U00010002]+" "<0>Hello</0>\x{10002}"
"[\x61b]+" "<0>abab</0>cde"
#"[\x6z]+" "\x06" #TODO: single hex digits should fail
"[\x{9}\x{75}\x{6d6}\x{6ba6}\x{6146B}\x{10ffe3}]+" "<0>\u0009\u0075\u06d6\u6ba6\U0006146B\U0010ffe3</0>abc"
"[\N{LATIN CAPITAL LETTER TONE SIX}ab\N{VARIATION SELECTOR-70} ]+" "x<0> \u0184\U000E0135 ab</0>c"
"[\N{LATIN SMALL LETTER C}-\N{LATIN SMALL LETTER F}]+" "ab<0>cdef</0>ghi"
#
# [set expressions], check the precedence of '-', '&', '--', '&&'
# '-' and '&', for compatibility with ICU UnicodeSet, have the same
# precedence as the implicit Union between adjacent items.
# '--' and '&&', for compatibility with Java, have lower precedence than
# the implicit Union operations. '--' and '&&' themselves
# have the same precedence, and group left to right.
#
"[[a-m]-[f-w]p]+" "<0>dep</0>fgwxyz"
"[^[a-m]-[f-w]p]+" "dep<0>fgwxyz</0>"
"[[a-m]--[f-w]p]+" "<0>de</0>pfgwxyz"
"[^[a-m]--[f-w]p]+" "de<0>pfgwxyz</0>"
"[[a-m]&[e-s]w]+" "<0>efmw</0>adnst"
"[^[a-m]&[e-s]w]+" "efmw<0>adnst</0>"
"[[a-m]&[e-s]]+" "<0>efm</0>adnst"
# {min,max} iteration qualifier
"A{3}BC" "<0>AAABC</0>"
"(ABC){2,3}AB" "no matchAB"
"(ABC){2,3}AB" "ABCAB"
"(ABC){2,3}AB" "<0>ABC<1>ABC</1>AB</0>"
"(ABC){2,3}AB" "<0>ABCABC<1>ABC</1>AB</0>"
"(ABC){2,3}AB" "<0>ABCABC<1>ABC</1>AB</0>CAB"
"(ABC){2}AB" "ABCAB"
"(ABC){2}AB" "<0>ABC<1>ABC</1>AB</0>"
"(ABC){2}AB" "<0>ABC<1>ABC</1>AB</0>CAB"
"(ABC){2}AB" "<0>ABC<1>ABC</1>AB</0>CABCAB"
"(ABC){2,}AB" "ABCAB"
"(ABC){2,}AB" "<0>ABC<1>ABC</1>AB</0>"
"(ABC){2,}AB" "<0>ABCABC<1>ABC</1>AB</0>"
"(ABC){2,}AB" "<0>ABCABCABC<1>ABC</1>AB</0>"
"X{0,0}ABC" "<0>ABC</0>"
"X{0,1}ABC" "<0>ABC</0>"
"(?:Hello(!{1,3}) there){1}" "Hello there"
"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!</1> there</0>"
"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!!</1> there</0>"
"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!!!</1> there</0>"
"(?:Hello(!{1,3}) there){1}" "Hello!!!! there"
# Nongreedy {min,max}? intervals
"(ABC){2,3}?AB" "no matchAB"
"(ABC){2,3}?AB" "ABCAB"
"(ABC){2,3}?AB" "<0>ABC<1>ABC</1>AB</0>"
"(ABC){2,3}?AB" "<0>ABC<1>ABC</1>AB</0>CAB"
"(ABC){2,3}?AB" "<0>ABC<1>ABC</1>AB</0>CABCAB"
"(ABC){2,3}?AX" "<0>ABCABC<1>ABC</1>AX</0>"
"(ABC){2,3}?AX" "ABC<0>ABCABC<1>ABC</1>AX</0>"
# Possessive {min,max}+ intervals
"(ABC){2,3}+ABC" "ABCABCABC"
"(ABC){1,2}+ABC" "<0>ABC<1>ABC</1>ABC</0>"
"(?:(.)\1){2,5}+." "<0>aabbcc<1>d</1>de</0>x"
# Atomic Grouping
"(?>.*)abc" "abcabcabc" # no match. .* consumed entire string.
"(?>(abc{2,4}?))(c*)" "<0><1>abcc</1><2>ccc</2></0>ddd"
"(\.\d\d(?>[1-9]?))\d+" "1.625"
"(\.\d\d(?>[1-9]?))\d+" "1<0><1>.625</1>0</0>"
# Possessive *+
"(abc)*+a" "abcabcabc"
"(abc)*+a" "<0>abc<1>abc</1>a</0>b"
"(a*b)*+a" "<0><1>aaaab</1>a</0>aaa"
# Possessive ?+
"c?+ddd" "<0>cddd</0>"
"c?+cddd" "cddd"
"c?cddd" "<0>cddd</0>"
# Back Reference
"(?:ab(..)cd\1)*" "<0>ab23cd23ab<1>ww</1>cdww</0>abxxcdyy"
"ab(?:c|(d?))(\1)" "<0>ab<1><2></2></1></0>c"
"ab(?:c|(d?))(\1)" "<0>ab<1>d</1><2>d</2></0>"
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>e"
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>"
# Back References that hit/don't hit end
"(abcd) \1" z "abcd abc"
"(abcd) \1" Z "<0><1>abcd</1> abcd</0>"
"(abcd) \1" Z "<0><1>abcd</1> abcd</0> "
# Case Insensitive back references that hit/don't hit end.
"(abcd) \1" zi "abcd abc"
"(abcd) \1" Zi "<0><1>abcd</1> ABCD</0>"
"(abcd) \1" Zi "<0><1>abcd</1> ABCD</0> "
# Back references that hit/don't hit boundary limits.
"(abcd) \1" z "<r>abcd abc</r>d "
"(abcd) \1" Z "<r><0><1>abcd</1> abcd</0></r> "
"(abcd) \1" Z "<r><0><1>abcd</1> abcd</0> </r>"
"(abcd) \1" zi "<r>abcd abc</r>d "
"(abcd) \1" Zi "<r><0><1>abcd</1> abcd</0></r> "
"(abcd) \1" Zi "<r><0><1>abcd</1> abcd</0> </r>"
# Back reference that fails match near the end of input without actually hitting the end.
"(abcd) \1" ZL "abcd abd"
"(abcd) \1" ZLi "abcd abd"
# Back reference to a zero-length match. They are always a successful match.
"ab(x?)cd(\1)ef" "<0>ab<1></1>cd<2></2>ef</0>"
"ab(x?)cd(\1)ef" i "<0>ab<1></1>cd<2></2>ef</0>"
# Back refs to capture groups that didn't participate in the match.
"ab(?:(c)|(d))\1" "abde"
"ab(?:(c)|(d))\1" "<0>ab<1>c</1>c</0>e"
"ab(?:(c)|(d))\1" i "abde"
"ab(?:(c)|(d))\1" i "<0>ab<1>c</1>c</0>e"
# Named back references
"(?<one>abcd)\k<one>" "<0><1>abcd</1>abcd</0>"
"(no)?(?<one>abcd)\k<one>" "<0><2>abcd</2>abcd</0>"
"(?<a_1>...)" E " " # backref names are ascii letters & numbers only
"(?<1a>...)" E " " # backref names must begin with a letter
"(?<a>.)(?<a>.)" E " " # Repeated names are illegal.
# Case Insensitive
"aBc" i "<0>ABC</0>"
"a[^bc]d" i "ABD"
'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
"(?:(?i)a)b" "<0>Ab</0>"
"ab(?i)cd" "<0>abCd</0>"
"ab$cd" "abcd"
"ssl" i "abc<0>ßl</0>xyz"
"ssl" i "abc<0>ẞl</0>xyz"
"FIND" i "can <0>\ufb01nd</0> ?" # fi ligature, \ufb01
"find" i "can <0>FIND</0> ?"
"ῧ" i "xxx<0>ῧ</0>xxx" # Composed char (match string) decomposes when case-folded (pattern)
# White space handling
"a b" "ab"
"abc " "abc"
"abc " "<0>abc </0>"
"ab[cd e]z" "<0>ab z</0>"
"ab\ c" "<0>ab c</0> "
"ab c" "<0>ab c</0> "
"ab c" x "ab c "
"ab\ c" x "<0>ab c</0> "
#
# Pattern Flags
#
"(?u)abc" "<0>abc</0>"
"(?-u)abc" "<0>abc</0>"
#
# \c escapes (Control-whatever)
#
"\cA" "<0>\u0001</0>"
"\ca" "<0>\u0001</0>"
"\c\x" "<0>\u001cx</0>"
#Multi-line mode
'b\s^' m "a\nb\n"
"(?m)^abc$" "abc \n abc\n<0>abc</0>\nabc"
"(?m)^abc$" 2 "abc \n abc\nabc\n<0>abc</0>"
"^abc$" 2 "abc \n abc\nabc\nabc"
# Empty and full range
"[\u0000-\U0010ffff]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"
"[^\u0000-\U0010ffff]" "abc\u0000\uffff\U00010000\U0010ffffzz"
"[^a--a]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"
# Free-spacing mode
"a b c # this is a comment" x "<0>abc</0> "
'^a (?#xxx) (?#yyy) {3}c' x "<0>aaac</0>"
"a b c [x y z]" x "abc "
"a b c [x y z]" x "a b c "
"a b c [x y z]" x "<0>abcx</0>yz"
"a b c [x y z]" x "<0>abcy</0>yz"
#
# Look Behind
#
"(?<=a)b" "a<0>b</0>"
"(.*)(?<=[bc])" "<0><1>abc</1></0>d"
"(?<=(abc))def" "<1>abc</1><0>def</0>" # lookbehind precedes main match.
"(?<=ab|abc)xyz" "abwxyz" # ab matches, but not far enough.
"(?<=abc)cde" "abcde"
"(?<=abc|ab)cde" "ab<0>cde</0>"
"(?<=abc|ab)cde" "abc<0>cde</0>"
"(?<=bc?c?c?)cd" "ab<0>cd</0>"
"(?<=bc?c?c?)cd" "abc<0>cd</0>"
"(?<=bc?c?c?)cd" "abcc<0>cd</0>"
"(?<=bc?c?c?)cd" "abccc<0>cd</0>"
"(?<=bc?c?c?)cd" "abcccccd"
"(?<=bc?c?c?)c+d" "ab<0>cccccd</0>"
".*(?<=: ?)(\w*)" "<0>1:one 2: two 3:<1>three</1></0> "
#
# Named Characters
#
"a\N{LATIN SMALL LETTER B}c" "<0>abc</0>"
"a\N{LATIN SMALL LETTER B}c" i "<0>abc</0>"
"a\N{LATIN SMALL LETTER B}c" i "<0>aBc</0>"
"a\N{LATIN SMALL LETTER B}c" "aBc"
"\N{FULL STOP}*" "<0>...</0>abc"
"$" "abc<0></0>"
#
# Optimizations of .* at end of patterns
#
"abc.*" "<0>abcdef</0>"
"abc.*$" "<0>abcdef</0>"
"abc(.*)" "<0>abc<1>def</1></0>"
"abc(.*)" "<0>abc<1></1></0>"
"abc.*" "<0>abc</0>\ndef"
"abc.*" s "<0>abc\ndef</0>"
"abc.*$" s "<0>abc\ndef</0>"
"abc.*$" "abc\ndef"
"abc.*$" m "<0>abc</0>\ndef"
"abc.*\Z" m "abc\ndef"
"abc.*\Z" sm "<0>abc\ndef</0>"
"abc*" "<0>abccc</0>d"
"abc*$" "<0>abccc</0>"
"ab(?:ab[xyz]\s)*" "<0>ababy abx </0>abc"
"(?:(abc)|a)(?:bc)+" "<0>abc</0>"
"(?:(abc)|a)(?:bc)*" "<0><1>abc</1></0>"
"^[+\-]?[0-9]*\.?[0-9]*" "<0>123.456</0>"
"ab.+yz" "<0>abc12345xyz</0>ttt"
"ab.+yz" s "<0>abc12345xyz</0>ttt"
"ab.+yz" "abc123\n45xyzttt"
"ab.+yz" s "<0>abc12\n345xyz</0>ttt"
"ab[0-9]+yz" "---abyz+++"
"ab[0-9]+yz" "---<0>ab1yz</0>+++"
"ab[0-9]+yz" "---<0>ab12yz</0>+++"
"ab[0-9]+yz" "---<0>ab123456yz</0>+++"
"ab([0-9]+|[A-Z]+)yz" "---abyz+++"
"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>1</1>yz</0>+++"
"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>12</1>yz</0>+++"
"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>A</1>yz</0>+++"
"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>AB</1>yz</0>+++"
"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>ABCDE</1>yz</0>+++"
#
# Hex format \x escaping
#
"ab\x63" "<0>abc</0>"
"ab\x09w" "<0>ab\u0009w</0>"
"ab\xabcdc" "<0>ab\u00abcdc</0>"
"ab\x{abcd}c" "<0>ab\uabcdc</0>"
"ab\x{101234}c" "<0>ab\U00101234c</0>"
"abα" "<0>abα</0>"
#
# Octal Escaping. This conforms to Java conventions, not Perl.
"\0101\00\03\073\0154\01442" "<0>A\u0000\u0003\u003b\u006c\u0064\u0032</0>"
"\0776" "<0>\u003f\u0036</0>" # overflow, the 6 is literal.
"\0376xyz" "<0>\u00fexyz</0>"
"\08" E "<0>\u00008</0>"
"\0" E "x"
#
# \u Surrogate Pairs
#
"\ud800\udc00" "<0>\U00010000</0>"
"\ud800\udc00*" "<0>\U00010000\U00010000\U00010000</0>\U00010001"
"\ud800\ud800\udc00" "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
"(\ud800)(\udc00)" "\U00010000"
"\U00010001+" "<0>\U00010001\U00010001</0>\udc01"
#
# hitEnd with find()
#
"abc" Z "aa<0>abc</0> abcab"
"abc" 2Z "aaabc <0>abc</0>ab"
"abc" 3z "aa>abc abcab"
#
# \ escaping
#
"abc\jkl" "<0>abcjkl</0>" # escape of a non-special letter is just itself.
"abc[ \j]kl" "<0>abcjkl</0>"
#
# \R all newline sequences.
#
"abc\Rxyz" "<0>abc\u000axyz</0>gh"
"abc\Rxyz" "<0>abc\u000bxyz</0>gh"
"abc\Rxyz" "<0>abc\u000cxyz</0>gh"
"abc\Rxyz" "<0>abc\u000dxyz</0>gh"
"abc\Rxyz" "<0>abc\u0085xyz</0>gh"
"abc\Rxyz" "<0>abc\u2028xyz</0>gh"
"abc\Rxyz" "<0>abc\u2029xyz</0>gh"
"abc\Rxyz" "<0>abc\u000d\u000axyz</0>gh"
"abc\R\nxyz" "abc\u000d\u000axyzgh" # \R cannot match only the CR from a CR/LF sequence.
"abc\r\nxyz" "<0>abc\u000d\u000axyz</0>gh"
"abc\Rxyz" "abc\u0009xyz" # Assorted non-matches.
"abc\Rxyz" "abc\u000exyz"
"abc\Rxyz" "abc\u202axyz"
# \v \V single character new line sequences.
"abc\vxyz" "<0>abc\u000axyz</0>gh"
"abc\vxyz" "<0>abc\u000bxyz</0>gh"
"abc\vxyz" "<0>abc\u000cxyz</0>gh"
"abc\vxyz" "<0>abc\u000dxyz</0>gh"
"abc\vxyz" "<0>abc\u0085xyz</0>gh"
"abc\vxyz" "<0>abc\u2028xyz</0>gh"
"abc\vxyz" "<0>abc\u2029xyz</0>gh"
"abc\vxyz" "abc\u000d\u000axyzgh"
"abc\vxyz" "abc?xyzgh"
"abc[\v]xyz" "<0>abc\u000axyz</0>gh"
"abc[\v]xyz" "<0>abc\u000bxyz</0>gh"
"abc[\v]xyz" "<0>abc\u000cxyz</0>gh"
"abc[\v]xyz" "<0>abc\u000dxyz</0>gh"
"abc[\v]xyz" "<0>abc\u0085xyz</0>gh"
"abc[\v]xyz" "<0>abc\u2028xyz</0>gh"
"abc[\v]xyz" "<0>abc\u2029xyz</0>gh"
"abc[\v]xyz" "abc\u000d\u000axyzgh"
"abc[\v]xyz" "abc?xyzgh"
"abc\Vxyz" "abc\u000axyzgh"
"abc\Vxyz" "abc\u000bxyzgh"
"abc\Vxyz" "abc\u000cxyzgh"
"abc\Vxyz" "abc\u000dxyzgh"
"abc\Vxyz" "abc\u0085xyzgh"
"abc\Vxyz" "abc\u2028xyzgh"
"abc\Vxyz" "abc\u2029xyzgh"
"abc\Vxyz" "abc\u000d\u000axyzgh"
"abc\Vxyz" "<0>abc?xyz</0>gh"
# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab
"abc\hxyz" "<0>abc xyz</0>gh"
"abc\Hxyz" "abc xyzgh"
"abc\hxyz" "<0>abc\u2003xyz</0>gh"
"abc\Hxyz" "abc\u2003xyzgh"
"abc\hxyz" "<0>abc\u0009xyz</0>gh"
"abc\Hxyz" "abc\u0009xyzgh"
"abc\hxyz" "abc?xyzgh"
"abc\Hxyz" "<0>abc?xyz</0>gh"
"abc[\h]xyz" "<0>abc xyz</0>gh"
"abc[\H]xyz" "abc xyzgh"
"abc[\h]xyz" "<0>abc\u2003xyz</0>gh"
"abc[\H]xyz" "abc\u2003xyzgh"
"abc[\h]xyz" "<0>abc\u0009xyz</0>gh"
"abc[\H]xyz" "abc\u0009xyzgh"
"abc[\h]xyz" "abc?xyzgh"
"abc[\H]xyz" "<0>abc?xyz</0>gh"
#
# Bug xxxx
#
"(?:\-|(\-?\d+\d\d\d))?(?:\-|\-(\d\d))?(?:\-|\-(\d\d))?(T)?(?:(\d\d):(\d\d):(\d\d)(\.\d+)?)?(?:(?:((?:\+|\-)\d\d):(\d\d))|(Z))?" MG "<0>-1234-21-31T41:51:61.789+71:81</0>"
#
# A random, complex, meaningless pattern that should at least compile
#
"(?![^\<C\f\0146\0270\}&&[|\02-\x3E\}|X-\|]]{7,}+)[|\\\x98\<\?\u4FCFr\,\0025\}\004|\0025-\0521]|(?<![|\01-\u829E])|(?<!\p{Alpha})|^|(?-s:[^\x15\\\x24F\a\,\a\u97D8[\x38\a[\0224-\0306[^\0020-\u6A57]]]]??)(?xix:[^|\{\[\0367\t\e\x8C\{\[\074c\]V[|b\fu\r\0175\<\07f\066s[^D-\x5D]]])(?xx:^{5,}+)(?d)(?=^\D)|(?!\G)(?>\G)(?![^|\]\070\ne\{\t\[\053\?\\\x51\a\075\0023-\[&&[|\022-\xEA\00-\u41C2&&[^|a-\xCC&&[^\037\uECB3\u3D9A\x31\|\<b\0206\uF2EC\01m\,\ak\a\03&&\p{Punct}]]]])(?-dxs:[|\06-\07|\e-\x63&&[|Tp\u18A3\00\|\xE4\05\061\015\0116C|\r\{\}\006\xEA\0367\xC4\01\0042\0267\xBB\01T\}\0100\?[|\[-\u459B|\x23\x91\rF\0376[|\?-\x94\0113-\\\s]]]]{6}?)(?<=[^\t-\x42H\04\f\03\0172\?i\u97B6\e\f\uDAC2])(?=\B)(?>[^\016\r\{\,\uA29D\034\02[\02-\[|\t\056\uF599\x62\e\<\032\uF0AC\0026\0205Q\|\\\06\0164[|\057-\u7A98&&[\061-g|\|\0276\n\042\011\e\xE8\x64B\04\u6D0EDW^\p{Lower}]]]]?)(?<=[^\n\\\t\u8E13\,\0114\u656E\xA5\]&&[\03-\026|\uF39D\01\{i\u3BC2\u14FE]])(?<=[^|\uAE62\054H\|\}&&^\p{Space}])(?sxx)(?<=[\f\006\a\r\xB4]{1,5})|(?x-xd:^{5}+)()" "<0></0>abc"
#
# Bug 3225
"1|9" "<0>1</0>"
"1|9" "<0>9</0>"
"1*|9" "<0>1</0>"
"1*|9" "<0></0>9"
"(?:a|ac)d" "<0>acd</0>"
"a|ac" "<0>a</0>c"
#
# Bug 3320
#
"(a([^ ]+)){0,} (c)" "<0><1>a<2>b</2></1> <3>c</3></0> "
"(a([^ ]+))* (c)" "<0><1>a<2>b</2></1> <3>c</3></0> "
#
# Bug 3436
#
"(.*?) *$" "<0><1>test</1> </0>"
#
# Bug 4034
#
"\D" "<0>A</0>BC\u00ffDEF"
"\d" "ABC\u00ffDEF"
"\D" "<0>\u00ff</0>DEF"
"\d" "\u00ffDEF"
"\D" "123<0>\u00ff</0>DEF"
"\D" "<0>\u0100</0>DEF"
"\D" "123<0>\u0100</0>DEF"
#
# Bug 4024, new line sequence handling
#
"(?m)^" "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"(?m)^" 2 "AA\u000d\u000a<0></0>BB\u000d\u000aCC\u000d\u000a"
"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0></0>CC\u000d\u000a"
"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"(?m)$" "AA<0></0>\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"(?m)$" 2 "AA\u000d\u000aBB<0></0>\u000d\u000aCC\u000d\u000a"
"(?m)$" 3 "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
"(?m)$" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
"(?m)$" 5 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"$" "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
"$" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
"$" 3 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"$" "\u000a\u0000a<0></0>\u000a"
"$" 2 "\u000a\u0000a\u000a<0></0>"
"$" 3 "\u000a\u0000a\u000a"
"$" "<0></0>"
"$" 2 ""
"$" "<0></0>\u000a"
"$" 2 "\u000a<0></0>"
"$" 3 "\u000a"
"^" "<0></0>"
"^" 2 ""
"\Z" "<0></0>"
"\Z" 2 ""
"\Z" 2 "\u000a<0></0>"
"\Z" "<0></0>\u000d\u000a"
"\Z" 2 "\u000d\u000a<0></0>"
# No matching ^ at interior new-lines if not in multi-line mode.
"^" "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"^" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
#
# Dot-matches-any mode, and stopping at new-lines if off.
#
"." "<0>1</0>23\u000aXYZ"
"." 2 "1<0>2</0>3\u000aXYZ"
"." 3 "12<0>3</0>\u000aXYZ"
"." 4 "123\u000a<0>X</0>YZ" # . doesn't match newlines
"." 4 "123\u000b<0>X</0>YZ"
"." 4 "123\u000c<0>X</0>YZ"
"." 4 "123\u000d<0>X</0>YZ"
"." 4 "123\u000d\u000a<0>X</0>YZ"
"." 4 "123\u0085<0>X</0>YZ"
"." 4 "123\u2028<0>X</0>YZ"
"." 4 "123\u2029<0>X</0>YZ"
"." 4s "123<0>\u000a</0>XYZ" # . matches any
"." 4s "123<0>\u000b</0>XYZ"
"." 4s "123<0>\u000c</0>XYZ"
"." 4s "123<0>\u000d</0>XYZ"
"." 4s "123<0>\u000d\u000a</0>XYZ"
"." 4s "123<0>\u0085</0>XYZ"
"." 4s "123<0>\u2028</0>XYZ"
"." 4s "123<0>\u2029</0>XYZ"
".{6}" "123\u000a\u000dXYZ"
".{6}" s "<0>123\u000a\u000dX</0>Y"
#
# Ranges (match regions, specified with <r>...</r> tags in the match string)
#
".*" "abc<r><0>def</0></r>ghi"
"a" "aaa<r><0>a</0>aa</r>aaa"
"a" 2 "aaa<r>a<0>a</0>a</r>aaa"
"a" 3 "aaa<r>aa<0>a</0></r>aaa"
"a" 4 "aaa<r>aaa</r>aaa"
"a" "aaa<r><0>a</0>aa</r>aaa"
#
# [set] parsing, systematically run through all of the parser states.
#
#
"[def]+" "abc<0>ddeeff</0>ghi" # set-open
"[^def]+" "<0>abc</0>defghi"
"[:digit:]+" "abc<0>123</0>def"
"[:^digit:]+" "<0>abc</0>123def"
"[\u005edef]+" "abc<0>de^f</0>ghi"
"[]]+" "abc<0>]]]</0>[def" # set-open2
"[^]]+" "<0>abc</0>]]][def"
"[:Lu:]+" "abc<0>ABC</0>def" # set-posix
"[:Lu]+" "abc<0>uL::Lu</0>"
"[:^Lu]+" "abc<0>uL:^:Lu</0>"
"[:]+" "abc<0>:::</0>def"
"[:whats this:]" E " "
"[--]+" dE "-------"
"[[nested]]+" "xyz[<0>nnetsteed</0>]abc" #set-start
"[\x{41}]+" "CB<0>AA</0>ZYX"
"[\[\]\\]+" "&*<0>[]\\</0>..."
"[*({<]+" "^&<0>{{(<<*</0>)))"
"[-def]+" "abc<0>def-ef-d</0>xyz" # set-start-dash
"[abc[--def]]" E " "
"[x[&def]]+" "abc<0>def&</0>ghi" # set-start-amp
"[&& is bad at start]" E " "
"[abc" E " " # set-after-lit
"[def]]" "abcdef"
"[def]]" "abcde<0>f]</0>]"
"[[def][ghi]]+" "abc]<0>defghi</0>[xyz" # set-after-set
"[[def]ghi]+" "abc]<0>defghi</0>[xyz"
"[[[[[[[[[[[abc]" E " "
"[[abc]\p{Lu}]+" "def<0>abcABC</0>xyz"
"[d-f]+" "abc<0>def</0>ghi" # set-after-range
"[d-f[x-z]]+" "abc<0>defxyzzz</0>gw"
"[\s\d]+" "abc<0> 123</0>def"
"[d-f\d]+" "abc<0>def123</0>ghi"
"[d-fr-t]+" "abc<0>defrst</0>uvw"
"[abc--]" E " " # set-after-op
"[[def]&&]" E " "
"[-abcd---]+" "<0>abc</0>--" #[-abcd]--[-]
"[&abcd&&&ac]+" "b<0>ac&&ca</0>d" #[&abcd]&&[&ac]
"[[abcd]&[ac]]+" "b<0>acac</0>d" # set-set-amp
"[[abcd]&&[ac]]+" "b<0>acac</0>d"
"[[abcd]&&ac]+" "b<0>acac</0>d"
"[[abcd]&ac]+" "<0>bacacd&&&</0>"
"[abcd&[ac]]+" "<0>bacacd&&&</0>" #set-lit-amp
"[abcd&&[ac]]+" "b<0>acac</0>d"
"[abcd&&ac]+" "b<0>acac</0>d"
"[[abcd]-[ac]]+" "a<0>bdbd</0>c" # set-set-dash
"[[abcd]--[ac]]+" "a<0>bdbd</0>c"
"[[abcd]--ac]+" "a<0>bdbd</0>c"
"[[abcd]-ac]+" "<0>bacacd---</0>"
"[a-d--[b-c]]+" "b<0>adad</0>c" # set-range-dash
"[a-d--b-c]+" "b<0>adad</0>c"
"[a-d-[b-c]]+" "<0>bad-adc</0>"
"[a-d-b-c]+" "<0>bad-adc</0>"
"[\w--[b-c]]+" "b<0>adad</0>c"
"[\w--b-c]+" "b<0>adad</0>c"
"[\w-[b-c]]+" "<0>bad-adc</0>"
"[\w-b-c]+" "<0>bad-adc</0>"
"[a-d&&[b-c]]+" "a<0>bcbc</0>d" # set-range-amp
"[a-d&&b-c]+" "a<0>bcbc</0>d"
"[a-d&[b-c]]+" "<0>abc&bcd</0>"
"[a-d&b-c]+" "<0>abc&bcd</0>"
"[abcd--bc]+" "b<0>adda</0>c" # set-lit-dash
"[abcd--[bc]]+" "b<0>adda</0>c"
"[abcd-[bc]]+" "<0>bad--dac</0>xyz"
"[abcd-]+" "<0>bad--dac</0>xyz"
"[abcd-\s]+" E "xyz<0>abcd --</0>xyz" # set-lit-dash-esc
"[abcd-\N{LATIN SMALL LETTER G}]+" "xyz-<0>abcdefg</0>hij-"
"[bcd-\{]+" "a<0>bcdefyz{</0>|}"
"[\p{Ll}]+" "ABC<0>abc</0>^&*&" # set-escape
"[\P{Ll}]+" "abc<0>ABC^&*&</0>xyz"
"[\N{LATIN SMALL LETTER Q}]+" "mnop<0>qqq</0>rst"
"[\sa]+" "cb<0>a a </0>(*&"
"[\S]+" " <0>hello</0> "
"[\w]+" " <0>hello_world</0>! "
"[\W]+" "a<0> *$%#,</0>hello "
"[\d]+" "abc<0>123</0>def"
"[\D]+" "123<0>abc</0>567"
"[\$\#]+" "123<0>$#$#</0>\\"
#
# Try each of the Java compatibility properties.
# These are checked here, while normal Unicode properties aren't, because
# these Java compatibility properties are implemented directly by regexp, while other
# properties are handled by ICU's Property and UnicodeSet APIs.
#
# These tests are only to verify that the names are recognized and the
# implementation isn't dead. They are not intended to verify that the
# function definitions are 100% correct.
#
"[:InBasic Latin:]+" "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
"[:^InBasic Latin:]+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
"\p{InBasicLatin}+" "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
"\P{InBasicLatin}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
"\p{InGreek}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
"\p{InCombining Marks for Symbols}" "<0>\u20d0</0>"
"\p{Incombiningmarksforsymbols}" "<0>\u20d0</0>"
"\p{javaDefined}+" "\uffff<0>abcd</0>\U00045678"
"\p{javaDigit}+" "abc<0>1234</0>xyz"
"\p{javaIdentifierIgnorable}+" "abc<0>\u0000\u000e\u009f</0>xyz"
"\p{javaISOControl}+" "abc<0>\u0000\u000d\u0083</0>xyz"
"\p{javaJavaIdentifierPart}+" "#@!<0>abc123_$</0>;"
"\p{javaJavaIdentifierStart}+" "123\u0301<0>abc$_</0>%^&"
"\p{javaLetter}+" "123<0>abcDEF</0>&*()("
"\p{javaLetterOrDigit}+" "$%^&*<0>123abcகஙசஜஞ</0>☺♘♚☔☎♬⚄⚡"
"\p{javaLowerCase}+" "ABC<0>def</0>&^%#:="
"\p{javaMirrored}+" "ab$%<0>(){}[]</0>xyz"
"\p{javaSpaceChar}+" "abc<0> \u00a0\u2028</0>!@#"
"\p{javaSupplementaryCodePoint}+" "abc\uffff<0>\U00010000\U0010ffff</0>\u0000"
"\p{javaTitleCase}+" "abCE<0>Džῌᾨ</0>123"
"\p{javaUnicodeIdentifierStart}+" "123<0>abcⅣ</0>%^&&*"
"\p{javaUnicodeIdentifierPart}+" "%&&^<0>abc123\u0301\u0002</0>..."
"\p{javaUpperCase}+" "abc<0>ABC</0>123"
"\p{javaValidCodePoint}+" "<0>\u0000abc\ud800 unpaired \udfff |\U0010ffff</0>"
"\p{javaWhitespace}+" "abc\u00a0\u2007\u202f<0> \u0009\u001c\u001f\u2028</0>42"
"\p{all}+" "<0>123\u0000\U0010ffff</0>"
"\P{all}+" "123\u0000\U0010ffff"
# [:word:] is implemented directly by regexp. Not a java compat property, but PCRE and others.
"[:word:]+" ".??$<0>abc123ΓΔΕΖΗ_</0>%%%"
"\P{WORD}+" "<0>.??$</0>abc123ΓΔΕΖΗ_%%%"
#
# Errors on unrecognized ASCII letter escape sequences.
#
"[abc\Y]+" "<0>abcY</0>"
"[abc\Y]+" eE "<0>abcY</0>"
"(?:a|b|c|\Y)+" "<0>abcY</0>"
"(?:a|b|c|\Y)+" eE "<0>abcY</0>"
"\Q\Y\E" e "<0>\\Y</0>"
#
# Reported problem
#
"[a-\w]" E "x"
#
# Bug 4045
#
"A*" "<0>AAAA</0>"
"A*" 2 "AAAA<0></0>"
"A*" 3 "AAAA"
"A*" 4 "AAAA"
"A*" 5 "AAAA"
"A*" 6 "AAAA"
"A*" "<0></0>"
"A*" 2 ""
"A*" 3 ""
"A*" 4 ""
"A*" 5 ""
#
# Bug 4046
#
"(?m)^" "<0></0>AA\u000dBB\u000dCC\u000d"
"(?m)^" 2 "AA\u000d<0></0>BB\u000dCC\u000d"
"(?m)^" 3 "AA\u000dBB\u000d<0></0>CC\u000d"
"(?m)^" 4 "AA\u000dBB\u000dCC\u000d"
"(?m)^" 5 "AA\u000dBB\u000dCC\u000d"
"(?m)^" 6 "AA\u000dBB\u000dCC\u000d"
"(?m)^" "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"(?m)^" 2 "AA\u000d\u000a<0></0>BB\u000d\u000aCC\u000d\u000a"
"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0></0>CC\u000d\u000a"
"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
#
# Bug 4059
#
"\w+" "<0>イチロー</0>"
"\b....\b." "<0>イチロー?</0>"
#
# Bug 4058 ICU Unicode Set patterns have an odd feature -
# A $ as the last character before the close bracket means match
# a \uffff, which means off the end of the string in transliterators.
# Didn't make sense for regular expressions, and is now fixed.
#
"[\$](P|C|D);" "<0>$<1>P</1>;</0>"
"[$](P|C|D);" "<0>$<1>P</1>;</0>"
"[$$](P|C|D);" "<0>$<1>P</1>;</0>"
#
# bug 4888 Flag settings lost in some cases.
#
"((a){2})|(#)" is "no"
"((a){2})|(#)" is "<0><1>a<2>a</2></1></0>#"
"((a){2})|(#)" is "a<0><3>#</3></0>"
"((a|b){2})|c" is "<0>c</0>"
"((a|b){2})|c" is "<0>C</0>"
"((a|b){2})|c" s "C"
#
# bug 5617 ZWJ \u200d shouldn't cause word boundaries
#
".+?\b" "<0> </0>\u0935\u0915\u094D\u200D\u0924\u0947 "
".+?\b" 2 " <0>\u0935\u0915\u094D\u200D\u0924\u0947</0> "
".+?\b" 3 " \u0935\u0915\u094D\u200D\u0924\u0947 "
#
# bug 5386 "^.*$" should match empty input
#
"^.*$" "<0></0>"
"^.*$" m "<0></0>"
"^.*$" "<0></0>\n"
"(?s)^.*$" "<0>\n</0>"
#
# bug 5386 Empty pattern and empty input should match.
#
"" "<0></0>abc"
"" "<0></0>"
#
# bug 5386 Range upper and lower bounds can be equal
#
"[a-a]" "<0>a</0>"
#
# bug 5386 $* should not fail, should match empty string.
#
"$*" "<0></0>abc"
#
# bug 5386 \Q ... \E escaping problem
#
"[a-z\Q-$\E]+" "QE<0>abc-def$</0>."
# More reported 5386 Java compatibility failures
#
"[^]*abb]*" "<0>kkkk</0>"
"\xa" "huh" # Java would like to be warned.
"^.*$" "<0></0>"
#
# bug 5386 Empty left alternation should produce a zero length match.
#
"|a" "<0></0>a"
"$|ab" "<0>ab</0>"
"$|ba" "ab<0></0>"
#
# bug 5386 Java compatibility for set expressions
#
"[a-z&&[cde]]+" "ab<0>cde</0>fg"
#
# bug 6019 matches() needs to backtrack and check for a longer match if the
# first match(es) found don't match the entire input.
#
"a?|b" "<0></0>b"
"a?|b" M "<0>b</0>"
"a?|.*?u|stuff|d" M "<0>stuff</0>"
"a?|.*?(u)|stuff|d" M "<0>stuff<1>u</1></0>"
"a+?" "<0>a</0>aaaaaaaaaaaa"
"a+?" M "<0>aaaaaaaaaaaaa</0>"
#
# Bug 7724. Expression to validate zip codes.
#
"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "<0><1>94040</1><2>-3344</2></0>"
"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "94040-0000"
"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "00000-3344"
#
# Bug 8666. Assertion failure on match, bad operand to JMP_SAV_X opcode.
#
"((.??)+|A)*" "<0><1><2></2></1></0>AAAAABBBBBCCCCCDDDDEEEEE"
#
# Bug 8826. Incorrect results with case insensitive matches.
#
"AS(X)" i "aßx"
"AS.*" i "aßx" # Expansion of sharp s can't split between pattern terms.
"ASßS" i "<0>aßß</0>" # All one literal string, does match.
"ASß{1}S" i "aßß" # Pattern with terms, no match.
"aßx" i "<0>assx</0>"
"aßx" i "<0>ASSX</0>"
"aßx" i "<0>aßx</0>"
"ASS(.)" i "<0>aß<1>x</1></0>"
# Case Insensitive, probe some corner cases.
"ass+" i "aß" # Second 's' in pattern is qualified, can't combine with first.
"as+" i "aß"
"aßs" i "as" # Can't match half of a ß
"aß+" i "<0>assssssss</0>s"
"aß+" i "<0>assßSssSSS</0>s"
"a(ß?)+" i "<0>assssssss<1></1></0>s"
"a(ß?)+" i "<0>a<1></1></0>zzzzzzzzs"
"\U00010400" i "<0>\U00010428</0>" # case folded supplemental code point.
"sstuff" i "<0>ßtuff</0>" # exercise optimizations on what chars can start a match.
"sstuff" i "s<0>ßtuff</0>" # exercise optimizations on what chars can start a match.
"ßtuff" i "s<0>sstuff</0>"
"ßtuff" i "s<0>Sstuff</0>"
"a(..)\1" i "<0>A<1>bc</1>BC</0>def"
"(ß)\1" i "aa<0><1>ss</1>ß</0>zz" # Case insensitive back reference
"..(.)\1" i "<0>aa<1>ß</1>ss</0>"
"ab(..)\1" i "xx<0>ab<1>ss</1>ß</0>ss"
" (ss) ((\1.*)|(.*))" i "<0> <1>ss</1> <2><4>sß</4></2></0>" # The back reference 'ss' must not match in 'sß'
# Bug 9057
# \u200c and \u200d should be word characters.
#
"\w+" " <0>abc\u200cdef\u200dghi</0> "
"\w+" i " <0>abc\u200cdef\u200dghi</0> "
"[\w]+" " <0>abc\u200cdef\u200dghi</0> "
"[\w]+" i " <0>abc\u200cdef\u200dghi</0> "
# Bug 9283
# uregex_open fails for look-behind assertion + case-insensitive
"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
# Bug 9719 Loop breaking on (zero length match){3,} (unlimited upper bound).
#
"(?:abc){1,}abc" "<0>abcabcabcabcabc</0>"
"(?:2*){2,}?a2\z" "<0>2a2</0>"
"(?:2*){2,}?a2\z" "2a3"
"(?:x?+){3,}+yz" "w<0>yz</0>"
"(2*){2,}?a2\\z" "2a3"
"(2*){2,}?a2\\z" "<0>2<1></1>a2\\z</0>"
"(2*){2,}?a2\z" "<0>2<1></1>a2</0>"
# Bug 10024
# Incorrect (unbounded) longest match length with {1, 20} style quantifiers.
# Unbounded match is disallowed in look-behind expressions.
# Max match length is used to limit where to check for look-behind matches.
"(?<=a{1,5})bc" "aaaa<0>bc</0>def"
"(?<=(?:aa){3,20})bc" "aaaaaa<0>bc</0>def"
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "def jkl"
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "rst <0>jkl</0>"
"(?<=a{11})bc" "aaaaaaaaaaa<0>bc</0>"
"(?<=a{11})bc" "aaaaaaaaaabc"
"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error.
"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression.
# Bug 10835
# Match Start Set not being correctly computed for case insensitive patterns.
# (Test here is to dump the compiled pattern & manually check the start set.)
"(private|secret|confidential|classified|restricted)" i "hmm, <0><1>Classified</1></0> stuff"
"(private|secret|confidential|classified|restricted)" "hmm, Classified stuff"
# Bug 10844
"^([\w\d:]+)$" "<0><1>DiesIst1Beispiel:text</1></0>"
"^([\w\d:]+)$" i "<0><1>DiesIst1Beispiel:text</1></0>"
"^(\w+\d\w+:\w+)$" "<0><1>DiesIst1Beispiel:text</1></0>"
"^(\w+\d\w+:\w+)$" i "<0><1>DiesIst1Beispiel:text</1></0>"
# Bug 11049
# Edge cases in find() when pattern match begins with set of code points
# and the match begins at the end of the string.
"A|B|C" "hello <0>A</0>"
"A|B|C" "hello \U00011234"
"A|B|\U00012345" "hello <0>\U00012345</0>"
"A|B|\U00010000" "hello \ud800"
# Bug 11369
# Incorrect optimization of patterns with a zero length quantifier {0}
"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE"
"(|b)ab(c)" "<0><1></1>ab<2>c</2></0>"
"(|b){0}a{3}(D*)" "<0>aaa<2></2></0>"
"(|b){0,1}a{3}(D*)" "<0><1></1>aaa<2></2></0>"
"((|b){0})a{3}(D*)" "<0><1></1>aaa<3></3></0>"
# Bug 11370
# Max match length computation of look-behind expression gives result that is too big to fit in the
# 24 bit operand portion of the compiled code. Expressions should fail to compile
# (Look-behind match length must be bounded. This case is treated as unbounded, an error.)
"(?<!(0123456789a){10000000})x" E "no match"
"(?<!\\ubeaf(\\ubeaf{11000}){11000})" E "no match"
# Bug 11374 Bad integer overflow check in number conversion.
# 4294967300 converts to 4 with 32 bit overflow.
"x{4294967300}" E "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
"x{0,4294967300}" E "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# Bug 11373
#
# Overflow checking in max match length computation for loops.
# Value here is 10 * 100000 * 3000 = 3E9, overflowing a 32 bit signed value.
# Before fixing, this case gave an assertion failure.
"(?<=((0123456789){100000}){3000})abc" E "abc"
# Bug 11507 Capture of an unpaired surrogate shouldn't allow a back reference to
# match half of a surrogate pair, but only another unpaired surrogate.
#
"pre(.)post\1" "pre\ud800post\ud800\udc00"
"pre(.)post\1" "<0>pre<1>\ud800</1>post\ud800</0> fin"
"pre(.)post\1" i "pre\ud800post\ud800\udc00" # case insensitive backrefs take a different code path
"pre(.)post\1" i "<0>pre<1>\ud800</1>post\ud800</0> fin"
# Bug 11554
#
# Maximum match length computation was assuming UTF-16.
# Used in look-behind matches to constrain how far back to look.
"(?<=a\x{100000})spam" "***a\x{100000}<0>spam</0>**"
"(?<=aą)spam" "**aą<0>spam</0>**"
"(?<=ąabc)spam" "**ąabc<0>spam</0>**"
"(?<=a\x{100000})spam" "***a\x{100001}spam**"
"(?<=aą)spam" "**bąspam**"
"(?<=ąabc)spam" "**ąabxspam**"
# with negative look-behind
"(?<!a\x{100000})spam" "***a\x{100000}spam**"
"(?<!aą)spam" "**aąspam**"
"(?<!ąabc)spam" "**ąabcspam**"
"(?<!a\x{100000})spam" "***a\x{100001}<0>spam</0>**"
"(?<!aą)spam" "**bą<0>spam</0>**"
"(?<!ąabc)spam" "**ąabx<0>spam</0>**"
# Bug #12930
#
# Minimum Match Length computation, int32_t overflow on an empty set in the pattern.
# The empty set, with no match possible, has a min match length of INT32_MAX.
# Was incremented subsequently. Caused assertion failure on pattern compile.
"[^\u0000-\U0010ffff]bc?" "bc no match"
"[^\u0000-\U0010ffff]?bc?" "<0>bc</0> has a match"
# Bug #12160 Hit End behavior after find fails to find.
# To match Java, should be true if find fails to find.
#
"abc" Z "<0>abc</0> abc abc xyz"
"abc" Z2 "abc <0>abc</0> abc xyz"
"abc" Z3 "abc abc <0>abc</0> xyz"
"abc" z4 "abc abc abc xyz"
# Bug #13844 Verify that non-standard Java property names are recognized.
"[\p{IsAlphabetic}]" " <0>A</0>"
"[\P{IsAlphabetic}]" "A<0> </0>"
"[\p{IsIdeographic}]" "A<0>〆</0>"
"[\P{IsIdeographic}]" "〆<0>A</0>"
"[\p{IsLetter}]" " <0>A</0>"
"[\P{IsLetter}]" "A<0> </0>"
"[\p{Letter}]" " <0>A</0>"
"[\p{IsLowercase}]" "A<0>a</0>"
"[\P{IsLowercase}]" "a<0>A</0>"
"[\p{IsUppercase}]" "a<0>A</0>"
"[\P{IsUppercase}]" "A<0>a</0>"
"[\p{IsTitlecase}]" "D<0>Dz</0>"
"[\P{IsTitlecase}]" "Dz<0>D</0>"
"[\p{IsPunctuation}]" " <0>&</0>"
"[\P{IsPunctuation}]" "&<0> </0>"
"[\p{IsControl}]" " <0>\x{82}</0>"
"[\P{IsControl}]" "\x{82}<0> </0>"
"[\p{IsWhite_Space}]" "x<0> </0>"
"[\P{IsWhite_Space}]" " <0>x</0>"
"[\p{IsDigit}]" " <0>4</0>"
"[\P{IsDigit}]" "4<0> </0>"
"[\p{IsHex_Digit}]" " <0>F</0>"
"[\P{IsHex_Digit}]" "F<0> </0>"
"[\p{IsJoin_Control}]" " <0>\x{200d}</0>"
"[\P{IsJoin_Control}]" "\x{200d}<0> </0>"
"[\p{IsNoncharacter_Code_Point}]" "A<0>\x{5fffe}</0>"
"[\p{IsAssigned}]" "\x{10ffff}<0>a</0>"
"[\P{IsAssigned}]" "a<0>\x{10ffff}</0>"
"[\p{InBasic Latin}]" "〆<0>A</0>"
"[\p{InBasicLatin}]" "〆<0>A</0>"
"[\p{InBasic-Latin}]" "〆<0>A</0>" # ICU accepts '-'; Java does not.
"[\p{InBasic_Latin}]" "〆<0>A</0>"
"[\p{Inbasiclatin}]" "〆<0>A</0>"
"[\p{inbasiclatin}]" E "〆<0>A</0>" # "In" must be cased as shown. Property name part is case insensitive.
"[\p{InCombining_Marks_for_Symbols}]" "a<0>\x{20DD}</0>" # COMBINING ENCLOSING CIRCLE
"[\p{all}]*" "<0>\x{00}abc\x{10ffff}</0>"
"[\p{javaBadProperty}]" E "whatever"
"[\p{IsBadProperty}]" E "whatever"
"[\p{InBadBlock}]" E "whatever"
"[\p{In}]" E "whatever"
"[\p{Is}]" E "whatever"
"[\p{java}]" "x<0>ꦉ</0>" # Note: "java" is a valid script code.
"[\p{javaLowerCase}]+" "A<0>a</0>"
"[\p{javaLowerCase}]+" i "<0>Aa</0>"
"[\P{javaLowerCase}]+" "<0>A</0>a"
"[\P{javaLowerCase}]+" i "Aa" # No Match because case fold of the set happens first, then negation.
# JDK is not case insensitive with named properties, even though
# the insensitive match flag is set. A JDK bug?
"[a-z]+" i "<0>Aa</0>" # Matches JDK behavior.
"[^a-z]+" i "Aa" # (no match) which is JDK behavior. Case fold first, then negation.
# Bug 20385. Assertion failure while compiling a negative look-behind expression consisting of a set with
# no contents. Meaning the [set] can never match. There is no syntax to directly express
# an empty set, so generate it by negating (^) a set of all code points.
# Also check empty sets in other contexts.
"(?<![^[^a]a])" "<0></0>abc"
"(?<![^\u0000-\U0010ffff])" "<0></0>abc"
"x(?<![^\u0000-\U0010ffff])" "<0>x</0>abc"
"x(?<![^\u0000-\U0010ffff]{1,5})" "<0>x</0>abc"
"x(?<![^\u0000-\U0010ffff]{0,5})" "xabc"
"(?<=[^\u0000-\U0010ffff])" "abc"
"(x?<=[^\u0000-\U0010ffff])" "abc"
"x(?<=[^\u0000-\U0010ffff]{1,5})" "xabc"
"x(?<=[^\u0000-\U0010ffff]{0,5})" "<0>x</0>abc"
"[^\u0000-\U0010ffff]" "a"
"[^[^\u0000-\U0010ffff]]" "<0>a</0>"
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
# Bug ICU-20544. Similar to 20385, above. Assertion failure with a negative look-behind assertion containing
--> --------------------
--> maximum size reached
--> --------------------