<?xml version="1.0" encoding="UTF-8"?>
<!-- Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html -->
<!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
<!-- Test data file for string search -->
<!DOCTYPE stringsearch-tests [
  <!-- Root element: one or more test cases. The optional debug attribute
       refers to the id of a single test-case. -->
  <!ELEMENT stringsearch-tests (test-case+)>
  <!ATTLIST stringsearch-tests debug IDREF #IMPLIED>

  <!-- A test case is a search pattern followed by the target text, which is
       given in up to three pieces: the text before the expected match (pre),
       the expected match itself (m) and the text after it (post).
       Cases that expect no match carry no m element. -->
  <!ELEMENT test-case (pattern, pre?, m?, post?)>
  <!ATTLIST test-case
      id                 ID                                                        #REQUIRED
      locale             CDATA                                                     "en"
      strength           (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
      norm               (ON | OFF)                                                "OFF"
      alternate_handling (NON_IGNORABLE | SHIFTED)                                 "NON_IGNORABLE">

  <!-- All leaf elements hold plain character data. -->
  <!ELEMENT pattern (#PCDATA)>
  <!ELEMENT pre     (#PCDATA)>
  <!ELEMENT m       (#PCDATA)>
  <!ELEMENT post    (#PCDATA)>
]>
<stringsearch-tests>
<!-- debug="test11" (for copying into the above element) -->
<!-- Very simple match -->
<test-case id="test01">
    <!-- All attributes defaulted: "abc" is expected to be found between "xxx" and "yyy". -->
    <pattern>abc</pattern>
    <pre>xxx</pre><m>abc</m><post>yyy</post>
</test-case>
<!-- Very simple no-match -->
<test-case id="test02">
    <!-- No m element: "abc" does not occur in the target, so no match is expected. -->
    <pattern>abc</pattern>
    <pre>xxx</pre><post>yyy</post>
</test-case>
<!-- Match after several near-misses. -->
<test-case id="test03">
    <!-- Near-miss words (spring, stling, strxng, ...) precede the expected match. -->
    <pattern>string</pattern>
    <pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post>
</test-case>
<test-case id="test04" strength="PRIMARY">
    <!-- PRIMARY strength: upper-case FUSS is expected to match lower-case "fuss". -->
    <pattern>FUSS</pattern>
    <pre>abc</pre><m>fuss</m><post>sss</post>
</test-case>
<test-case id="test05" strength="PRIMARY">
    <!-- PRIMARY strength: FUSS is expected to match "fuß" (sharp s); the following "sss" is not part of the match. -->
    <pattern>FUSS</pattern>
    <pre>abc</pre><m>fuß</m><post>sss</post>
</test-case>
<test-case id="test05.5" strength="PRIMARY">
    <!-- PRIMARY strength: "fuss" is expected to match the "fuß" that begins "fußball". -->
    <pattern>fuss</pattern>
    <pre>a </pre>
    <m>fuß</m>
    <post>ball table</post>
</test-case>
<test-case id="test06" strength="PRIMARY">
    <!-- PRIMARY strength: a pattern written with ß is expected to match "fuss". -->
    <pattern>fuß</pattern>
    <pre>abc</pre><m>fuss</m><post>xyz</post>
</test-case>
<test-case id="test07" strength="SECONDARY">
    <!-- SECONDARY strength, no m element: "fuß" is not expected to match inside "abcfussxyz". -->
    <pattern>fuß</pattern>
    <pre>abcfussxyz</pre>
</test-case>
<test-case id="test08" strength="PRIMARY">
    <!-- PRIMARY strength, no m element: "fus" is not expected to match within "abcfuß". -->
    <pattern>fus</pattern>
    <pre>abcfuß</pre><post>xyz</post>
</test-case>
<!-- A good match following an initial match that failed because
of not ending on a character boundary -->
<test-case id="test09" strength="PRIMARY">
    <!-- PRIMARY strength: the expected match is the "fus" that follows "fuß ", not the "fuß" itself. -->
    <pattern>fus</pattern>
    <pre>fuß </pre><m>fus</m><post>sss</post>
</test-case>
<!-- Test cases from usrchdat.inc BREAKITERATOREXACT -->
<test-case id="test10" strength="TERTIARY">
    <!-- The match is expected at the very start of the target (no pre element); a second "fox" sits in post. -->
    <pattern>fox</pattern>
    <m>fox</m><post>y fox</post>
</test-case>
<test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook">
    <!-- German phonebook collation, PRIMARY strength: "toe" is expected to match "Tö". -->
    <pattern>toe</pattern>
    <pre>This is a </pre><m>Tö</m><post>ne</post>
</test-case>
<test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook">
    <!-- Same pattern and locale as test11 but SECONDARY strength: no m element, so no match is expected. -->
    <pattern>toe</pattern>
    <pre>This is a </pre><post>Töne</post>
</test-case>
<test-case id="test12" strength="TERTIARY">
    <!-- TERTIARY strength: the accented é characters in pre are not the match; only the final plain "e" is. -->
    <pattern>e</pattern>
    <pre>tésting that é doés not match </pre><m>e</m><post></post>
</test-case>
<test-case id="test13" strength="PRIMARY" locale="fr">
    <!-- French, PRIMARY strength: "e" is expected to match the first É; a second É follows in post. -->
    <pattern>e</pattern>
    <pre></pre><m>É</m><post>É</post>
</test-case>
<test-case id="test14" strength="PRIMARY" locale="fr">
    <!-- French, PRIMARY strength: "O" is expected to match O followed by \u0302. -->
    <pattern>O</pattern>
    <pre>C</pre><m>O\u0302</m><post>TÉ</post>
</test-case>
<!-- Test cases from usrchdat.inc STRENGTH -->
<test-case id="test15" strength="PRIMARY" locale="en">
    <!-- The first "fox" is the expected match; "foxes" appears later in post. -->
    <pattern>fox</pattern>
    <pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post>
</test-case>
<test-case id="test16" strength="PRIMARY" locale="fr">
    <!-- French, PRIMARY strength: unaccented "peche" is expected to match p\u00E9ch\u00E9. -->
    <pattern>peche</pattern>
    <pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post>
</test-case>
<test-case id="test17" strength="PRIMARY" locale="fr">
    <!-- French, PRIMARY strength: unaccented "peche" is expected to match p\u00EAche. -->
    <pattern>peche</pattern>
    <pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post>
</test-case>
<test-case id="test18" strength="PRIMARY" locale="fr">
    <!-- French, PRIMARY strength: "peche" is expected to match p\u00E9che even though an "r" follows it. -->
    <pattern>peche</pattern>
    <pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post>
</test-case>
<test-case id="test19" strength="PRIMARY" locale="fr">
    <!-- French, PRIMARY strength: "peche" is expected to match p\u00EAche even though an "r" follows it. -->
    <pattern>peche</pattern>
    <pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post>
</test-case>
<test-case id="test20" strength="PRIMARY" locale="es">
    <!-- Spanish, PRIMARY strength: "channel" is expected to match itself; a comma follows the match. -->
    <pattern>channel</pattern>
    <pre>A </pre><m>channel</m><post>, </post>
</test-case>
<test-case id="test21" strength="PRIMARY" locale="es">
    <!-- Spanish, PRIMARY strength: lower-case "channel" is expected to match upper-case "CHANNEL". -->
    <pattern>channel</pattern>
    <pre>A </pre><m>CHANNEL</m><post>, </post>
</test-case>
<test-case id="test22" strength="PRIMARY" locale="es">
    <!-- Spanish, PRIMARY strength: "channel" is expected to match the "Channel" that starts "Channels". -->
    <pattern>channel</pattern>
    <pre>A </pre><m>Channel</m><post>s, </post>
</test-case>
<test-case id="test23" strength="PRIMARY" locale="es">
    <!-- Spanish, PRIMARY strength: "channel" is expected to match; three dots follow the match. -->
    <pattern>channel</pattern>
    <pre>A </pre><m>channel</m><post>... </post>
</test-case>
<test-case id="test24" strength="TERTIARY" locale="en">
    <!-- TERTIARY strength: A\u0300 is expected to match \u00c0, but not the plain "A" characters
         that appear in pre and at the end of post. -->
    <pattern>A\u0300</pattern>
    <pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A</post>
</test-case>
<!-- TODO: In the original test data, this test matched at IDENTICAL strength.
Doesn't seem right. The characters are different.
-->
<test-case id="test24a" strength="IDENTICAL" locale="en">
    <!-- IDENTICAL strength: the data currently expects A\u0300 to match \u00c0. -->
    <pattern>A\u0300</pattern>
    <pre>At IDENTICAL, should this match? </pre><m>\u00c0</m><post></post>
</test-case>
<test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
    <!-- Same data as test24a, with alternate_handling set to SHIFTED. -->
    <pattern>A\u0300</pattern>
    <pre>At IDENTICAL, should this match? </pre>
    <m>\u00c0</m>
    <post></post>
</test-case>
<test-case id="test25" strength="SECONDARY" locale="en">
    <!-- SECONDARY strength: upper-case Ű is expected to match lower-case ű; another Ű follows in post. -->
    <pattern>Ű</pattern>
    <pre>12</pre><m>ű</m><post> Ű</post>
</test-case>
<test-case id="test26" strength="SECONDARY" locale="en">
    <!-- SECONDARY strength: upper-case "A" is expected to match lower-case "a". -->
    <pattern>A</pattern>
    <pre>12</pre><m>a</m><post>...</post>
</test-case>
<!-- Test Cases from usrchdat.inc, VARIABLE -->
<test-case id="test27" strength="TERTIARY" locale="en">
    <!-- The hyphenated "black-bird" in pre is not the match; the unhyphenated "blackbird" after it is. -->
    <pattern>blackbird</pattern>
    <pre>black-bird </pre><m>blackbird</m><post>...</post>
</test-case>
<test-case id="test28" strength="TERTIARY" locale="en">
    <!-- No m element: "go" is not expected to be found in " on". -->
    <pattern>go</pattern>
    <pre> on</pre>
</test-case>
<!-- TODO: this gives a U_ILLEGAL_ARGUMENT_ERROR when opening
the UStringSearch. How did the original test run? -->
<!--
<test-case id="test29" strength="PRIMARY" locale="en">
<pattern> </pattern>
<pre></pre><m></m><post>abc</post>
</test-case>
-->
<test-case id="test30" strength="SECONDARY" locale="en">
    <!-- No m element: the letters a, b and c occur only with spaces between them,
         so "abc" is not expected to match. -->
    <pattern>abc</pattern>
    <pre> a bc ab c a bc ab c</pre>
</test-case>
<test-case id="test31" strength="SECONDARY" locale="en">
    <!-- No m element: the target is a space followed by a run of hyphens, so no match is expected. -->
    <pattern>abc</pattern>
    <pre> ---------------</pre>
</test-case>
<!-- Normalization test cases from usrchdat.inc -->
<test-case id="test32" strength="TERTIARY" norm="ON">
    <!-- norm ON: the pattern is expected to match a target carrying the same two marks in the opposite order. -->
    <pattern>a\u0325\u0300</pattern>
    <pre></pre><m>a\u0300\u0325</m>
</test-case>
<test-case id="test32a" strength="TERTIARY" norm="OFF">
    <!-- Same pattern and target text as test32 but norm OFF: no m element, so no match is expected. -->
    <pattern>a\u0325\u0300</pattern>
    <pre>a\u0300\u0325</pre>
</test-case>
<!-- COMPOSITEBOUNDARIES from usrchdat.inc
Boundaries are not identical to original test data because
of matching only full combining sequences
-->
<test-case id="test40" strength="TERTIARY">
    <!-- No m element: plain "A" is not expected to match À. -->
    <pattern>A</pattern>
    <pre>À</pre>
    <!-- \u00C0 -->
</test-case>
<test-case id="test41" strength="TERTIARY">
    <!-- The expected match is the plain "A" that follows À, not À itself. -->
    <pattern>A</pattern>
    <pre>À</pre><m>A</m><post>C</post>
</test-case>
<test-case id="test42" strength="TERTIARY">
    <!-- No m element: A\u030A is not expected to match anywhere in À\u01FA. -->
    <pattern>A\u030A</pattern>
    <pre>À\u01FA</pre>
</test-case>
<!-- SUPPLEMENTARYCANONICAL from usrchdat.inc -->
<test-case id="test50" strength="TERTIARY">
    <!-- pre holds other surrogate pairs; the first \uD800\uDC00 is the expected match,
         and further occurrences follow in post. -->
    <pattern>\uD800\uDC00</pattern>
    <pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m>
    <post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
</test-case>
<test-case id="test51" strength="TERTIARY">
    <!-- The pattern text is expected to match its identical occurrence between "and" and "this sentence". -->
    <pattern>\\uD834\\uDDB9</pattern>
    <pre>and</pre><m>\\uD834\\uDDB9</m><post>this sentence</post>
</test-case>
<test-case id="test52" strength="TERTIARY">
    <!-- Variant of the same search with a space on each side of the pattern and of the expected match. -->
    <pattern> \\uD834\\uDDB9 </pattern>
    <pre>and</pre><m> \\uD834\\uDDB9 </m><post>this sentence</post>
</test-case>
<test-case id="test53" strength="TERTIARY">
    <!-- Variant of the same search with a hyphen on each side of the pattern and of the expected match. -->
    <pattern>-\\uD834\\uDDB9-</pattern>
    <pre>and</pre><m>-\\uD834\\uDDB9-</m><post>this sentence</post>
</test-case>
<test-case id="test54" strength="TERTIARY">
    <!-- Variant of the same search with a comma on each side of the pattern and of the expected match. -->
    <pattern>,\\uD834\\uDDB9,</pattern>
    <pre>and</pre><m>,\\uD834\\uDDB9,</m><post>this sentence</post>
</test-case>
<test-case id="test55" strength="TERTIARY">
    <!-- Variant of the same search with a question mark on each side of the pattern and of the expected match. -->
    <pattern>?\\uD834\\uDDB9?</pattern>
    <pre>and</pre><m>?\\uD834\\uDDB9?</m><post>this sentence</post>
</test-case>
<!-- Long combining sequences -->
<!-- Backwards search fails because the pattern ends with ignorables
<test-case id="test60" strength="PRIMARY">
<pattern>A\u0301\u0301\u0301\u0301</pattern>
<m>A\u0301\u0301\u0301\u0301\u0301</m>
</test-case>
-->
<test-case id="test61" strength="TERTIARY">
    <!-- No m element: the pattern has four \u0301 after "A", the target has five, so no match is expected. -->
    <pattern>A\u0301\u0301\u0301\u0301</pattern>
    <pre>A\u0301\u0301\u0301\u0301\u0301</pre>
</test-case>
<test-case id="test62" strength="TERTIARY">
    <!-- Pattern and target are the same sequence (A plus four \u0301); the whole target is the expected match. -->
    <pattern>A\u0301\u0301\u0301\u0301</pattern>
    <m>A\u0301\u0301\u0301\u0301</m>
</test-case>
<!-- stand-alone combining marks don't match attached marks -->
<test-case id="test63" strength="TERTIARY">
    <!-- No m element: a lone \u0301 is not expected to match the marks that follow "A". -->
    <pattern>\u0301</pattern>
    <pre>A\u0301\u0301\u0301\u0301</pre>
</test-case>
<test-case id="test64" strength="TERTIARY">
    <!-- No m element: the target text (given in post) is a run of four \u0301, and no match is expected. -->
    <pattern>\u0301</pattern>
    <post>\u0301\u0301\u0301\u0301</post>
</test-case>
<!-- stand-alone combining mark does match an un-attached combining mark -->
<test-case id="test65" strength="TERTIARY">
    <!-- The \u0301 at the very start of the target is the expected match; the marks after "A" are in post. -->
    <pattern>\u0301</pattern>
    <m>\u0301</m><post>A\u0301\u0301</post>
</test-case>
<test-case id="test66" strength="TERTIARY">
    <!-- The target consists of a single \u0301, which is the expected match. -->
    <pattern>\u0301</pattern>
    <m>\u0301</m>
</test-case>
<!-- stand-alone combining marks at end of the target text -->
<test-case id="test67" strength="TERTIARY">
    <!-- The expected match is the \u0301 at the end of the target, right after "abcd\r". -->
    <pattern>\u0301</pattern>
    <pre>abcd\r</pre><m>\u0301</m>
</test-case>
<!-- attached combining marks at end of the target text, no match -->
<test-case id="test68" strength="TERTIARY">
    <!-- No m element: the \u0301 directly follows "d", and no match is expected. -->
    <pattern>\u0301</pattern>
    <pre>abcd\u0301</pre>
</test-case>
<!-- no match within expansions at the start -->
<test-case id="test70" strength="PRIMARY">
    <!-- PRIMARY strength, no m element: "Eligature" is not expected to match inside "Æligature". -->
    <pattern>Eligature</pattern>
    <pre>Æligature</pre>
</test-case>
<test-case id="test71" strength="PRIMARY">
    <!-- PRIMARY strength: "AEligature" is expected to match the whole of "Æligature". -->
    <pattern>AEligature</pattern>
    <m>Æligature</m>
</test-case>
<test-case id="test72" strength="PRIMARY">
    <!-- Same pattern, target and strength as test71. -->
    <pattern>AEligature</pattern>
    <m>Æligature</m>
</test-case>
<!-- unattached combining Tilde will not match a Tilde that is
part of a composed Ñ (\u00D1) -->
<test-case id="test73" strength="SECONDARY">
    <!-- SECONDARY strength: the expected match is the \u0303 placed after the pre text.
         The closing pre tag below is left unindented: whitespace added before it would
         become part of the target text. -->
    <pattern>\u0303</pattern>
    <!-- combining tilde -->
    <pre>Ñ
</pre><m>\u0303</m>
</test-case>
<test-case id="test74" strength="SECONDARY">
    <!-- Same as test73 but with an "a" after the expected match.
         The closing pre tag below is left unindented: whitespace added before it would
         become part of the target text. -->
    <pattern>\u0303</pattern>
    <!-- combining tilde -->
    <pre>Ñ
</pre><m>\u0303</m><post>a</post>
</test-case>
<test-case id="test75" strength="TERTIARY" locale="fr">
    <!-- French, TERTIARY strength: \u00EA is expected to match the identical \u00EA in the target. -->
    <pattern>\u00EA</pattern>
    <pre>p</pre><m>\u00EA</m><post>che</post>
</test-case>
<test-case id="test76" strength="TERTIARY" locale="fr">
    <!-- French, TERTIARY strength: \u00EA in the pattern is expected to match e\u0302 in the target. -->
    <pattern>\u00EA</pattern>
    <pre>p</pre><m>e\u0302</m><post>che</post>
</test-case>
<test-case id="test77" strength="TERTIARY" locale="fr">
    <!-- French, TERTIARY strength: e\u0302 in the pattern is expected to match \u00EA in the target. -->
    <pattern>e\u0302</pattern>
    <pre>p</pre><m>\u00EA</m><post>che</post>
</test-case>
<!-- Test cases from ticket:5382 -->
<test-case id="test78" strength="SECONDARY" locale="hu_HU">
    <!-- Hungarian, SECONDARY strength: \u0170 is expected to match \u0171 at the start of the target. -->
    <pattern>\u0170</pattern>
    <m>\u0171</m>
    <post>12</post>
</test-case>
<test-case id="test79" strength="SECONDARY" locale="hu_HU">
    <!-- Hungarian, SECONDARY strength: \u0170 is expected to match \u0171 in the middle of the target. -->
    <pattern>\u0170</pattern>
    <pre>1</pre>
    <m>\u0171</m>
    <post>2</post>
</test-case>
<test-case id="test80" strength="SECONDARY" locale="hu_HU">
    <!-- Hungarian, SECONDARY strength: \u0170 is expected to match \u0171 at the end of the target. -->
    <pattern>\u0170</pattern>
    <pre>12</pre>
    <m>\u0171</m>
</test-case>
<!-- Test cases from ticket:5959 -->
<test-case id="test81" strength="SECONDARY">
    <!-- SECONDARY strength: \u2166 in the pattern is expected to match the three letters "VII". -->
    <pattern>\u2166</pattern>
    <m>VII</m>
</test-case>
<test-case id="test82" strength="SECONDARY">
    <!-- Reverse of test81: the letters "VII" are expected to match \u2166 in the target. -->
    <pattern>VII</pattern>
    <m>\u2166</m>
</test-case>
<test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
    <!-- IDENTICAL strength with SHIFTED alternate handling: the phrase is expected to match
         its verbatim occurrence in the sentence. -->
    <pattern>Universal Declaration of Human Rights</pattern>
    <pre>Proclaims this </pre>
    <m>Universal Declaration of Human Rights</m>
    <post> as a common standard of achievement for all peoples and all nations</post>
</test-case>
<test-case id="test83b"
           strength="TERTIARY"
           alternate_handling="SHIFTED"
           locale="en">
    <!-- SHIFTED alternate handling: the space-separated pattern is expected to match
         the hyphen-separated phrase in the target. -->
    <pattern>Universal Declaration of Human Rights</pattern>
    <pre>Proclaims this </pre>
    <m>Universal-Declaration-of-Human-Rights</m>
    <post> as a common standard of achievement for all peoples and all nations</post>
</test-case>
<test-case id="test84"
           strength="TERTIARY"
           locale="en">
    <!-- TERTIARY strength: the pattern (middle character \u0591) is expected to match
         a target whose middle character is \u0592. -->
    <pattern>\u05E9\u0591\u05E9</pattern>
    <m>\u05E9\u0592\u05E9</m>
</test-case>
<test-case id="test84b"
           strength="IDENTICAL"
           locale="en">
    <!-- Same pattern and target text as test84 but IDENTICAL strength: no m element,
         so no match is expected. -->
    <pattern>\u05E9\u0591\u05E9</pattern>
    <pre>\u05E9\u0592\u05E9</pre>
</test-case>
</stringsearch-tests>