12e5b6d6dSopenharmony_ci# Copyright (C) 2016 and later: Unicode, Inc. and others.
22e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html
32e5b6d6dSopenharmony_ci# Copyright (c) 2002-2016  International Business Machines Corporation and
42e5b6d6dSopenharmony_ci# others. All Rights Reserved.
52e5b6d6dSopenharmony_ci#
62e5b6d6dSopenharmony_ci#  file:  line.txt
72e5b6d6dSopenharmony_ci#
82e5b6d6dSopenharmony_ci#         Line Breaking Rules
92e5b6d6dSopenharmony_ci#         Implement default line breaking as defined by
102e5b6d6dSopenharmony_ci#         Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
112e5b6d6dSopenharmony_ci#         for Unicode 14.0, with the following modification:
122e5b6d6dSopenharmony_ci#
132e5b6d6dSopenharmony_ci#         Boundaries between hyphens and following letters are suppressed when
142e5b6d6dSopenharmony_ci#         there is a boundary preceding the hyphen. See rule LB 20.09 below.
152e5b6d6dSopenharmony_ci#
162e5b6d6dSopenharmony_ci#         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
172e5b6d6dSopenharmony_ci#         It sets characters of class CJ to behave like NS.
182e5b6d6dSopenharmony_ci
192e5b6d6dSopenharmony_ci#
202e5b6d6dSopenharmony_ci#  Character Classes defined by TR 14.
212e5b6d6dSopenharmony_ci#
222e5b6d6dSopenharmony_ci
# Rule-builder options.
#   !!chain                 - allow a match to continue ("chain") from the last character of one rule
#                             into the start of another.
#   !!quoted_literals_only  - literal characters must be quoted; unquoted text in the rules is
#                             always syntax ($variables, [sets], operators).
232e5b6d6dSopenharmony_ci!!chain;
242e5b6d6dSopenharmony_ci!!quoted_literals_only;
252e5b6d6dSopenharmony_ci
# One variable per UAX #14 Line_Break property value; each is the set of code points with that value.
# Entries that are NOT a plain property lookup:
#   $HH  - the single code point U+2010 (used only by the hyphen tailoring rule).
#   $NS  - Nonstarter plus everything in $CJ (the "strict" tailoring).
#   $CM  - intentionally not defined in this list (see the commented-out line); its real
#          definition needs $ZWJ and $SA and therefore appears after them.
262e5b6d6dSopenharmony_ci$AI = [:LineBreak =  Ambiguous:];
272e5b6d6dSopenharmony_ci$AL = [:LineBreak =  Alphabetic:];
282e5b6d6dSopenharmony_ci$BA = [:LineBreak =  Break_After:];
292e5b6d6dSopenharmony_ci$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
302e5b6d6dSopenharmony_ci$BB = [:LineBreak =  Break_Before:];
312e5b6d6dSopenharmony_ci$BK = [:LineBreak =  Mandatory_Break:];
322e5b6d6dSopenharmony_ci$B2 = [:LineBreak =  Break_Both:];
332e5b6d6dSopenharmony_ci$CB = [:LineBreak =  Contingent_Break:];
342e5b6d6dSopenharmony_ci$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
352e5b6d6dSopenharmony_ci$CL = [:LineBreak =  Close_Punctuation:];
362e5b6d6dSopenharmony_ci# $CM = [:LineBreak =  Combining_Mark:];
372e5b6d6dSopenharmony_ci$CP = [:LineBreak =  Close_Parenthesis:];
382e5b6d6dSopenharmony_ci$CR = [:LineBreak =  Carriage_Return:];
392e5b6d6dSopenharmony_ci$EB = [:LineBreak =  EB:];
402e5b6d6dSopenharmony_ci$EM = [:LineBreak =  EM:];
412e5b6d6dSopenharmony_ci$EX = [:LineBreak =  Exclamation:];
422e5b6d6dSopenharmony_ci$GL = [:LineBreak =  Glue:];
432e5b6d6dSopenharmony_ci$HL = [:LineBreak =  Hebrew_Letter:];
442e5b6d6dSopenharmony_ci$HY = [:LineBreak =  Hyphen:];
452e5b6d6dSopenharmony_ci$H2 = [:LineBreak =  H2:];
462e5b6d6dSopenharmony_ci$H3 = [:LineBreak =  H3:];
472e5b6d6dSopenharmony_ci$ID = [:LineBreak =  Ideographic:];
# NOTE(review): "Inseperable" looks like a typo but appears to be a registered alias of the
# Line_Break value IN (alongside "Inseparable") - confirm against PropertyValueAliases.txt
# before "correcting" it.
482e5b6d6dSopenharmony_ci$IN = [:LineBreak =  Inseperable:];
492e5b6d6dSopenharmony_ci$IS = [:LineBreak =  Infix_Numeric:];
502e5b6d6dSopenharmony_ci$JL = [:LineBreak =  JL:];
512e5b6d6dSopenharmony_ci$JV = [:LineBreak =  JV:];
522e5b6d6dSopenharmony_ci$JT = [:LineBreak =  JT:];
532e5b6d6dSopenharmony_ci$LF = [:LineBreak =  Line_Feed:];
542e5b6d6dSopenharmony_ci$NL = [:LineBreak =  Next_Line:];
552e5b6d6dSopenharmony_ci# NS includes CJ for CSS strict line breaking.
562e5b6d6dSopenharmony_ci$NS = [[:LineBreak =  Nonstarter:] $CJ];
572e5b6d6dSopenharmony_ci$NU = [:LineBreak =  Numeric:];
582e5b6d6dSopenharmony_ci$OP = [:LineBreak =  Open_Punctuation:];
592e5b6d6dSopenharmony_ci$PO = [:LineBreak =  Postfix_Numeric:];
602e5b6d6dSopenharmony_ci$PR = [:LineBreak =  Prefix_Numeric:];
612e5b6d6dSopenharmony_ci$QU = [:LineBreak =  Quotation:];
622e5b6d6dSopenharmony_ci$RI = [:LineBreak =  Regional_Indicator:];
632e5b6d6dSopenharmony_ci$SA = [:LineBreak =  Complex_Context:];
642e5b6d6dSopenharmony_ci$SG = [:LineBreak =  Surrogate:];
652e5b6d6dSopenharmony_ci$SP = [:LineBreak =  Space:];
662e5b6d6dSopenharmony_ci$SY = [:LineBreak =  Break_Symbols:];
672e5b6d6dSopenharmony_ci$WJ = [:LineBreak =  Word_Joiner:];
682e5b6d6dSopenharmony_ci$XX = [:LineBreak =  Unknown:];
692e5b6d6dSopenharmony_ci$ZW = [:LineBreak =  ZWSpace:];
702e5b6d6dSopenharmony_ci$ZWJ = [:LineBreak = ZWJ:];
712e5b6d6dSopenharmony_ci
722e5b6d6dSopenharmony_ci# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
732e5b6d6dSopenharmony_ci# without a formal name. Because ICU rules require multiple uses of the expressions,
742e5b6d6dSopenharmony_ci# give them a single definition with a name
752e5b6d6dSopenharmony_ci
# $OP30 / $CP30      : $OP and $CP with every code point whose East_Asian_Width is F, W or H removed.
# $ExtPictUnassigned : code points that are Extended_Pictographic AND unassigned (gc=Cn);
#                      used as a "potential emoji" base by the LB 30b rule.
762e5b6d6dSopenharmony_ci$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
772e5b6d6dSopenharmony_ci$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
782e5b6d6dSopenharmony_ci
792e5b6d6dSopenharmony_ci$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
802e5b6d6dSopenharmony_ci
812e5b6d6dSopenharmony_ci# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
822e5b6d6dSopenharmony_ci#         list it in the numerous rules that use CM.
832e5b6d6dSopenharmony_ci# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
842e5b6d6dSopenharmony_ci
# $CM  = Line_Break=Combining_Mark  +  ZWJ  +  the SA characters whose general category is Mn or Mc.
# $CMX = $CM with ZWJ taken back out, for the rules that must tell a real mark from a ZWJ.
852e5b6d6dSopenharmony_ci$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
862e5b6d6dSopenharmony_ci$CMX = [[$CM] - [$ZWJ]];
872e5b6d6dSopenharmony_ci
882e5b6d6dSopenharmony_ci#   Dictionary character set, for triggering language-based break engines. Currently
892e5b6d6dSopenharmony_ci#   limited to LineBreak=Complex_Context (SA).
902e5b6d6dSopenharmony_ci
# The set is all of $SA, marks included (the Mn/Mc members are not subtracted here, unlike in $ALPlus).
# NOTE(review): the variable is not referenced by any rule in this file; presumably the rule
# builder picks it up by its name - confirm against the ICU break-rule documentation.
912e5b6d6dSopenharmony_ci$dictionary = [$SA];
922e5b6d6dSopenharmony_ci
932e5b6d6dSopenharmony_ci#
942e5b6d6dSopenharmony_ci#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
952e5b6d6dSopenharmony_ci#                               SA  (Dictionary chars, excluding Mn and Mc)
962e5b6d6dSopenharmony_ci#                               SG  (Unpaired Surrogates)
972e5b6d6dSopenharmony_ci#                               XX  (Unknown, unassigned)
982e5b6d6dSopenharmony_ci#                         as $AL  (Alphabetic)
992e5b6d6dSopenharmony_ci#
# $ALPlus = AL + AI + SG + XX + (SA minus its Mn/Mc members).
# The SA marks removed here are exactly the ones that the $CM definition adds, so every SA
# character ends up in either $ALPlus or $CM.
1002e5b6d6dSopenharmony_ci$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
1012e5b6d6dSopenharmony_ci
1022e5b6d6dSopenharmony_ci
1032e5b6d6dSopenharmony_ci## -------------------------------------------------
1042e5b6d6dSopenharmony_ci
1052e5b6d6dSopenharmony_ci#
1062e5b6d6dSopenharmony_ci# CAN_CM  is the set of characters that may combine with CM combining chars.
1072e5b6d6dSopenharmony_ci#         Note that Linebreak UAX 14's concept of a combining char and the rules
1082e5b6d6dSopenharmony_ci#         for what they can combine with are _very_ different from the rest of Unicode.
1092e5b6d6dSopenharmony_ci#
1102e5b6d6dSopenharmony_ci#         Note that $CM itself is left out of this set.  If CM is needed as a base
1112e5b6d6dSopenharmony_ci#         it must be listed separately in the rule.
1122e5b6d6dSopenharmony_ci#
# The two sets are exact complements: same member list, $CAN_CM negated with '^'.
# NOTE(review): $CANT_CM is not referenced by any rule visible in this file.
1132e5b6d6dSopenharmony_ci$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
1142e5b6d6dSopenharmony_ci$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
1152e5b6d6dSopenharmony_ci
1162e5b6d6dSopenharmony_ci#
1172e5b6d6dSopenharmony_ci# AL_FOLLOW  set of chars that can unconditionally follow an AL
1182e5b6d6dSopenharmony_ci#            Needed in rules where stand-alone $CM s are treated as AL.
1192e5b6d6dSopenharmony_ci#
# Explicit union of the classes listed; note it uses $OP30 (not $OP) and $ALPlus (not $AL).
# In this file it is referenced only by the LB 14 rule for a stand-alone $CM after "OP SP+".
1202e5b6d6dSopenharmony_ci$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
1212e5b6d6dSopenharmony_ci
1222e5b6d6dSopenharmony_ci
1232e5b6d6dSopenharmony_ci#
1242e5b6d6dSopenharmony_ci#  Rule LB 4, 5    Mandatory (Hard) breaks.
1252e5b6d6dSopenharmony_ci#
# $LB4Breaks    : the four hard-break classes.
# $LB4NonBreaks : everything that is neither a hard-break class nor $CM.
# "CR LF" is matched as a single unit so no boundary can fall between the two.
# {100} is the rule-status value attached to a boundary produced by this rule; every
# hard-break rule in this file uses 100 (presumably ICU's "hard line break" status - confirm).
1262e5b6d6dSopenharmony_ci$LB4Breaks    = [$BK $CR $LF $NL];
1272e5b6d6dSopenharmony_ci$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
1282e5b6d6dSopenharmony_ci$CR $LF {100};
1292e5b6d6dSopenharmony_ci
1302e5b6d6dSopenharmony_ci#
1312e5b6d6dSopenharmony_ci#  LB 6    Do not break before hard line breaks.
1322e5b6d6dSopenharmony_ci#
# "x (BK | CR | LF | NL)" in three forms, all tagged {100}:
#   1. an optional single non-break, non-CM character, then the hard break;
#   2. a base that can take marks, any marks attached to it, then the hard break;
#   3. a run of marks with no base, then the hard break (leading ^ = no chaining into the rule).
# NOTE(review): the trailing "LB 5" tag on rule 1 disagrees with the "LB 6" heading of this
# section; the heading matches the UAX #14 rule number - confirm and align the tag.
1332e5b6d6dSopenharmony_ci$LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
1342e5b6d6dSopenharmony_ci$CAN_CM $CM*    $LB4Breaks {100};
1352e5b6d6dSopenharmony_ci^$CM+           $LB4Breaks {100};
1362e5b6d6dSopenharmony_ci
1372e5b6d6dSopenharmony_ci# LB 7         x SP
1382e5b6d6dSopenharmony_ci#              x ZW
# No boundary before SP or ZW, in the usual three forms: single preceding character,
# base + marks, and stand-alone marks (leading ^ = no chaining into the rule).
# $LB4NonBreaks itself contains SP and ZW, so runs such as "SP SP" are covered by the first form.
1392e5b6d6dSopenharmony_ci$LB4NonBreaks [$SP $ZW];
1402e5b6d6dSopenharmony_ci$CAN_CM $CM*  [$SP $ZW];
1412e5b6d6dSopenharmony_ci^$CM+         [$SP $ZW];
1422e5b6d6dSopenharmony_ci
1432e5b6d6dSopenharmony_ci#
1442e5b6d6dSopenharmony_ci# LB 8         Break after zero width space
1452e5b6d6dSopenharmony_ci#              ZW SP* ÷
1462e5b6d6dSopenharmony_ci#
# From LB 8 onward ZW counts as a break character: it is added to the break set and
# removed from the non-break set.
# Rule: ZW, any spaces, then one character that is not SP, ZW or a hard break. The '/'
# marks the boundary position: after the spaces, before that look-ahead character.
1472e5b6d6dSopenharmony_ci$LB8Breaks    = [$LB4Breaks $ZW];
1482e5b6d6dSopenharmony_ci$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
1492e5b6d6dSopenharmony_ci$ZW $SP* / [^$SP $ZW $LB4Breaks];
1502e5b6d6dSopenharmony_ci
1512e5b6d6dSopenharmony_ci# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
1522e5b6d6dSopenharmony_ci#
# ZWJ followed by any one character outside $CM is kept together.
# $CM contains ZWJ, so "ZWJ ZWJ" and "ZWJ <mark>" are not matched by this rule.
1532e5b6d6dSopenharmony_ci$ZWJ [^$CM];
1542e5b6d6dSopenharmony_ci
1552e5b6d6dSopenharmony_ci# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK, $CR, $LF, $NL or $ZW
1562e5b6d6dSopenharmony_ci# LB 10                          $CM not covered by the above needs to behave like $AL
1572e5b6d6dSopenharmony_ci#                                See definition of $CAN_CM.
1582e5b6d6dSopenharmony_ci
# Rule 1: a base that can take marks followed by one or more marks.
# Rule 2: one or more marks with no base; the leading ^ means other rules cannot chain
#         into it, so it only applies to marks that no preceding rule has attached to a base.
1592e5b6d6dSopenharmony_ci$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
1602e5b6d6dSopenharmony_ci^$CM+;
1612e5b6d6dSopenharmony_ci
1622e5b6d6dSopenharmony_ci#
1632e5b6d6dSopenharmony_ci# LB 11  Do not break before or after WORD JOINER & related characters.
1642e5b6d6dSopenharmony_ci#
# "x WJ": base + marks, single non-break character (may be SP), or stand-alone marks, then WJ.
# "WJ x": WJ, its marks, then any single character ('.').
1652e5b6d6dSopenharmony_ci$CAN_CM $CM*  $WJ;
1662e5b6d6dSopenharmony_ci$LB8NonBreaks $WJ;
1672e5b6d6dSopenharmony_ci^$CM+         $WJ;
1682e5b6d6dSopenharmony_ci
1692e5b6d6dSopenharmony_ci$WJ $CM* .;
1702e5b6d6dSopenharmony_ci
1712e5b6d6dSopenharmony_ci#
1722e5b6d6dSopenharmony_ci# LB 12  Do not break after NBSP and related characters.
1732e5b6d6dSopenharmony_ci#         GL  x
1742e5b6d6dSopenharmony_ci#
# GL, any marks attached to it, then any single character ('.') - nothing may break after GL.
1752e5b6d6dSopenharmony_ci$GL $CM* .;
1762e5b6d6dSopenharmony_ci
1772e5b6d6dSopenharmony_ci#
1782e5b6d6dSopenharmony_ci# LB 12a  Do not break before NBSP and related characters ...
1792e5b6d6dSopenharmony_ci#            [^SP BA HY] x GL
1802e5b6d6dSopenharmony_ci#
# Rule 1: any non-break character other than SP, BA or HY, with its marks, then GL.
# Rule 2: stand-alone marks, then GL (leading ^ = no chaining into the rule).
1812e5b6d6dSopenharmony_ci[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
1822e5b6d6dSopenharmony_ci^$CM+ $GL;
1832e5b6d6dSopenharmony_ci
1842e5b6d6dSopenharmony_ci
1852e5b6d6dSopenharmony_ci
1862e5b6d6dSopenharmony_ci
1872e5b6d6dSopenharmony_ci# LB 13   Don't break before ']' or '!' or '/', even after spaces.
1882e5b6d6dSopenharmony_ci#
# The same three-rule pattern is repeated for each of CL, CP, EX and SY:
#   single non-break character (this set still contains SP, which gives "even after spaces"),
#   base + marks, and stand-alone marks (leading ^ = no chaining into the rule).
1892e5b6d6dSopenharmony_ci$LB8NonBreaks $CL;
1902e5b6d6dSopenharmony_ci$CAN_CM $CM*  $CL;
1912e5b6d6dSopenharmony_ci^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
1922e5b6d6dSopenharmony_ci
1932e5b6d6dSopenharmony_ci$LB8NonBreaks $CP;
1942e5b6d6dSopenharmony_ci$CAN_CM $CM*  $CP;
1952e5b6d6dSopenharmony_ci^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
1962e5b6d6dSopenharmony_ci
1972e5b6d6dSopenharmony_ci$LB8NonBreaks $EX;
1982e5b6d6dSopenharmony_ci$CAN_CM $CM*  $EX;
1992e5b6d6dSopenharmony_ci^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
2002e5b6d6dSopenharmony_ci
2012e5b6d6dSopenharmony_ci$LB8NonBreaks $SY;
2022e5b6d6dSopenharmony_ci$CAN_CM $CM*  $SY;
2032e5b6d6dSopenharmony_ci^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
2042e5b6d6dSopenharmony_ci
2052e5b6d6dSopenharmony_ci
2062e5b6d6dSopenharmony_ci#
2072e5b6d6dSopenharmony_ci# LB 14  Do not break after OP, even after spaces
2082e5b6d6dSopenharmony_ci#        Note subtle interaction with "SP IS /" rules in LB14a.
2092e5b6d6dSopenharmony_ci#        This rule consumes the SP, chaining happens on the IS, effectively overriding the SP IS rules,
2102e5b6d6dSopenharmony_ci#        which is the desired behavior.
2112e5b6d6dSopenharmony_ci#
# Rule 1: OP, its marks, any number of spaces, then any single character.
# Rule 2: OP, its marks, one or more spaces, then a run of marks (which has no base because
#         SP cannot take marks), optionally followed by one character from $AL_FOLLOW.
2122e5b6d6dSopenharmony_ci$OP $CM* $SP* .;
2132e5b6d6dSopenharmony_ci
2142e5b6d6dSopenharmony_ci$OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
2152e5b6d6dSopenharmony_ci                                   # by rule 8, CM following a SP is stand-alone.
2162e5b6d6dSopenharmony_ci
2172e5b6d6dSopenharmony_ci
2182e5b6d6dSopenharmony_ci# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
2192e5b6d6dSopenharmony_ci#        Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
2202e5b6d6dSopenharmony_ci#        See issue ICU-20303
2212e5b6d6dSopenharmony_ci
2222e5b6d6dSopenharmony_ci
# $CanFollowIS: explicit list of classes treated as able to follow an IS without a break.
# Both rules put the boundary (at '/') right after "SP IS [marks]" when the next character
# is none of: a $CanFollowIS member, NU, or CM.
#   Rule 1: no marks after the IS.
#   Rule 2: marks after the IS, the last of which must be a non-ZWJ mark ($CMX).
2232e5b6d6dSopenharmony_ci$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
2242e5b6d6dSopenharmony_ci$SP $IS           / [^ $CanFollowIS $NU $CM];
2252e5b6d6dSopenharmony_ci$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
2272e5b6d6dSopenharmony_ci#
2282e5b6d6dSopenharmony_ci# LB 14b Do not break before numeric separators (IS), even after spaces.
2292e5b6d6dSopenharmony_ci
# Rule 1: any non-break character except SP, directly followed by IS.
# Rule 2: "SP IS [marks]" followed by a $CanFollowIS character or end of input ({eof}).
# Rule 3: "SP IS [marks] ZWJ" followed by one character that is neither CM nor NU.
# Rules 4-5: base + marks before IS, and stand-alone marks before IS (leading ^ = no chaining in).
2302e5b6d6dSopenharmony_ci[$LB8NonBreaks - $SP] $IS;
2312e5b6d6dSopenharmony_ci$SP $IS $CM* [$CanFollowIS {eof}];
2322e5b6d6dSopenharmony_ci$SP $IS $CM* $ZWJ [^$CM $NU];
2332e5b6d6dSopenharmony_ci
2342e5b6d6dSopenharmony_ci$CAN_CM $CM*  $IS;
2352e5b6d6dSopenharmony_ci^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
2362e5b6d6dSopenharmony_ci
2372e5b6d6dSopenharmony_ci
2382e5b6d6dSopenharmony_ci# LB 15
# LB 15-17 share one shape: first class, its marks, any number of spaces, second class.
#   LB 15: QU SP* x OP
#   LB 16: (CL | CP) SP* x NS      ($NS here also contains CJ)
#   LB 17: B2 SP* x B2
2392e5b6d6dSopenharmony_ci# LB 15
2402e5b6d6dSopenharmony_ci$QU $CM* $SP* $OP;
2412e5b6d6dSopenharmony_ci
2422e5b6d6dSopenharmony_ci# LB 16
2432e5b6d6dSopenharmony_ci($CL | $CP) $CM* $SP* $NS;
2442e5b6d6dSopenharmony_ci
2452e5b6d6dSopenharmony_ci# LB 17
2462e5b6d6dSopenharmony_ci$B2 $CM* $SP* $B2;
2462e5b6d6dSopenharmony_ci
2472e5b6d6dSopenharmony_ci#
2482e5b6d6dSopenharmony_ci# LB 18  Break after spaces.
2492e5b6d6dSopenharmony_ci#
# LB 18 is expressed only through these sets: from here on SP counts as a break character,
# so rules built on $LB18NonBreaks never join a space to what follows it.
# NOTE(review): $LB18Breaks is not referenced by any rule visible in this file.
2502e5b6d6dSopenharmony_ci$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
2512e5b6d6dSopenharmony_ci$LB18Breaks    = [$LB8Breaks $SP];
2522e5b6d6dSopenharmony_ci
2532e5b6d6dSopenharmony_ci
2542e5b6d6dSopenharmony_ci# LB 19
2552e5b6d6dSopenharmony_ci#         x QU
# "x QU": a non-break, non-space character with its marks, or stand-alone marks, then QU.
# "QU x": QU, its marks, then any single character ('.').
2562e5b6d6dSopenharmony_ci$LB18NonBreaks $CM* $QU;
2572e5b6d6dSopenharmony_ci^$CM+               $QU;
2582e5b6d6dSopenharmony_ci
2592e5b6d6dSopenharmony_ci#         QU  x
2602e5b6d6dSopenharmony_ci$QU $CM* .;
2612e5b6d6dSopenharmony_ci
2622e5b6d6dSopenharmony_ci# LB 20
2632e5b6d6dSopenharmony_ci#        <break>  $CB
2642e5b6d6dSopenharmony_ci#        $CB   <break>
2652e5b6d6dSopenharmony_ci#
# LB 20 has no rule of its own: CB is simply removed from the non-break set, so the later
# "x Y" rules that start from $LB20NonBreaks never join a CB to the character after it.
2662e5b6d6dSopenharmony_ci$LB20NonBreaks = [$LB18NonBreaks - $CB];
2672e5b6d6dSopenharmony_ci
2682e5b6d6dSopenharmony_ci# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
2692e5b6d6dSopenharmony_ci#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
2702e5b6d6dSopenharmony_ci#             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
2712e5b6d6dSopenharmony_ci#
# HY or U+2010, its marks, then an $ALPlus letter. The leading ^ forbids chaining into the
# rule, which is what restricts it to hyphens that have a boundary in front of them.
# $HH is listed separately because U+2010 is class BA, not HY.
2722e5b6d6dSopenharmony_ci^($HY | $HH) $CM* $ALPlus;
2732e5b6d6dSopenharmony_ci
2742e5b6d6dSopenharmony_ci# LB 21        x   (BA | HY | NS)
2752e5b6d6dSopenharmony_ci#           BB x
2762e5b6d6dSopenharmony_ci#
# "x (BA | HY | NS)": a non-break, non-space, non-CB character with its marks, or stand-alone
# marks (leading ^ = no chaining in), followed by BA, HY or NS.
# "BB x": BB, its marks, then the next character.
# NOTE(review): every member of $LB20NonBreaks is also in [^$CB], so the second BB rule looks
# subsumed by the first - confirm before simplifying.
2772e5b6d6dSopenharmony_ci$LB20NonBreaks $CM* ($BA | $HY | $NS);
2782e5b6d6dSopenharmony_ci
2792e5b6d6dSopenharmony_ci
2802e5b6d6dSopenharmony_ci^$CM+ ($BA | $HY | $NS);
2812e5b6d6dSopenharmony_ci
2822e5b6d6dSopenharmony_ci$BB $CM* [^$CB];                                  #  $BB  x
2832e5b6d6dSopenharmony_ci$BB $CM* $LB20NonBreaks;
2842e5b6d6dSopenharmony_ci
2852e5b6d6dSopenharmony_ci# LB 21a Don't break after Hebrew + Hyphen
2862e5b6d6dSopenharmony_ci#   HL (HY | BA) x
2872e5b6d6dSopenharmony_ci#
# HL, its marks, HY or BA, its marks, then optionally one character that is not CB.
2882e5b6d6dSopenharmony_ci$HL $CM* ($HY | $BA) $CM* [^$CB]?;
2892e5b6d6dSopenharmony_ci
2902e5b6d6dSopenharmony_ci# LB 21b (forward) Don't break between SY and HL
2912e5b6d6dSopenharmony_ci# (break between HL and SY already disallowed by LB 13 above)
# SY, its marks, then HL.
2922e5b6d6dSopenharmony_ci$SY $CM* $HL;
2932e5b6d6dSopenharmony_ci
2942e5b6d6dSopenharmony_ci# LB 22  Do not break before ellipses
2952e5b6d6dSopenharmony_ci#
# "x IN": a non-break, non-space, non-CB character with its marks, or stand-alone marks
# (leading ^ = no chaining into the rule), followed by IN.
2962e5b6d6dSopenharmony_ci$LB20NonBreaks $CM*    $IN;
2972e5b6d6dSopenharmony_ci^$CM+ $IN;
2982e5b6d6dSopenharmony_ci
2992e5b6d6dSopenharmony_ci
3002e5b6d6dSopenharmony_ci# LB 23
3012e5b6d6dSopenharmony_ci#
# Letters and digits stick together in both directions:
#   (ALPlus | HL) x NU, stand-alone marks x NU, and NU x (ALPlus | HL).
3022e5b6d6dSopenharmony_ci($ALPlus | $HL) $CM* $NU;
3032e5b6d6dSopenharmony_ci^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
3042e5b6d6dSopenharmony_ci$NU $CM* ($ALPlus | $HL);
3052e5b6d6dSopenharmony_ci
3062e5b6d6dSopenharmony_ci# LB 23a
3072e5b6d6dSopenharmony_ci#
# PR x (ID | EB | EM)   and   (ID | EB | EM) x PO, with marks allowed after the first element.
3082e5b6d6dSopenharmony_ci$PR $CM* ($ID | $EB | $EM);
3092e5b6d6dSopenharmony_ci($ID | $EB | $EM) $CM*  $PO;
3102e5b6d6dSopenharmony_ci
3112e5b6d6dSopenharmony_ci
3122e5b6d6dSopenharmony_ci#
3132e5b6d6dSopenharmony_ci# LB 24
3142e5b6d6dSopenharmony_ci#
# (PR | PO) x (ALPlus | HL),  (ALPlus | HL) x (PR | PO),  and stand-alone marks x (PR | PO).
3152e5b6d6dSopenharmony_ci($PR | $PO) $CM* ($ALPlus | $HL);
3162e5b6d6dSopenharmony_ci($ALPlus | $HL) $CM* ($PR | $PO);
3172e5b6d6dSopenharmony_ci^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
3182e5b6d6dSopenharmony_ci
3192e5b6d6dSopenharmony_ci#
3202e5b6d6dSopenharmony_ci# LB 25   Numbers.
3212e5b6d6dSopenharmony_ci#
# One regular expression for a whole number, continued over two lines:
#   optional (PR | PO), optional (OP | HY), optional IS, one required NU,
#   any number of (NU | SY | IS), optional (CL | CP), optional (PR | PO).
# $CM* is accepted after each leading element and before each trailing one.
3222e5b6d6dSopenharmony_ci(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
3232e5b6d6dSopenharmony_ci    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
3242e5b6d6dSopenharmony_ci
3252e5b6d6dSopenharmony_ci# LB 26  Do not break a Korean syllable
3262e5b6d6dSopenharmony_ci#
# Allowed jamo/syllable successions, each with marks permitted after the first element:
#   JL        x (JL | JV | H2 | H3)
#   (JV | H2) x (JV | JT)
#   (JT | H3) x JT
3272e5b6d6dSopenharmony_ci$JL $CM* ($JL | $JV | $H2 | $H3);
3282e5b6d6dSopenharmony_ci($JV | $H2) $CM* ($JV | $JT);
3292e5b6d6dSopenharmony_ci($JT | $H3) $CM* $JT;
3302e5b6d6dSopenharmony_ci
3312e5b6d6dSopenharmony_ci# LB 27  Treat a Korean syllable block the same as ID (do not break before PO or after PR)
# Any Hangul class (JL, JV, JT, H2, H3) x PO, and PR x any Hangul class - the same pairing
# that LB 23a gives to ID.
3322e5b6d6dSopenharmony_ci($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
3332e5b6d6dSopenharmony_ci$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
3342e5b6d6dSopenharmony_ci
3352e5b6d6dSopenharmony_ci
3362e5b6d6dSopenharmony_ci# LB 28   Do not break between alphabetics
3372e5b6d6dSopenharmony_ci#
# (ALPlus | HL) x (ALPlus | HL), plus stand-alone marks x (ALPlus | HL).
3382e5b6d6dSopenharmony_ci($ALPlus | $HL) $CM* ($ALPlus | $HL);
3392e5b6d6dSopenharmony_ci^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
3402e5b6d6dSopenharmony_ci
3412e5b6d6dSopenharmony_ci# LB 29
# IS, its marks, then a letter (ALPlus or HL).
3422e5b6d6dSopenharmony_ci$IS $CM* ($ALPlus | $HL);
3432e5b6d6dSopenharmony_ci
3442e5b6d6dSopenharmony_ci# LB 30
# (ALPlus | HL | NU) x OP30, stand-alone marks x OP30, and CP30 x (ALPlus | HL | NU).
# $OP30 / $CP30 are $OP / $CP without the East_Asian_Width F, W and H code points, so the
# excluded brackets are not covered by these rules.
3452e5b6d6dSopenharmony_ci($ALPlus | $HL | $NU) $CM* $OP30;
3462e5b6d6dSopenharmony_ci^$CM+ $OP30;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
3472e5b6d6dSopenharmony_ci$CP30 $CM* ($ALPlus | $HL | $NU);
3482e5b6d6dSopenharmony_ci
3492e5b6d6dSopenharmony_ci# LB 30a  Do not break between regional indicators. Break after pairs of them.
3502e5b6d6dSopenharmony_ci#         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
# All three rules start with an RI pair (marks allowed after the first RI).
#   Rule 1: boundary (at '/') right after the pair, when the next character is outside the
#           listed no-break-before classes and is not CM.
#   Rule 2: same, but the pair is followed by marks ending in a non-ZWJ mark.
#   Rule 3: no boundary - the pair, its marks, then one of the listed classes, ZWJ, or
#           end of input ({eof}).
3512e5b6d6dSopenharmony_ci$RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
3522e5b6d6dSopenharmony_ci$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
3532e5b6d6dSopenharmony_ci$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
3542e5b6d6dSopenharmony_ci# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
3552e5b6d6dSopenharmony_ci#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
3562e5b6d6dSopenharmony_ci#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
3572e5b6d6dSopenharmony_ci
3582e5b6d6dSopenharmony_ci# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
# Rule 1: EB, its marks, then EM.
# Rule 2: an unassigned Extended_Pictographic code point, its marks, then EM.
3592e5b6d6dSopenharmony_ci$EB $CM* $EM;
3602e5b6d6dSopenharmony_ci$ExtPictUnassigned $CM* $EM;
3612e5b6d6dSopenharmony_ci
3622e5b6d6dSopenharmony_ci# LB 31 Break everywhere else.
3632e5b6d6dSopenharmony_ci#       Match a single code point if no other rule applies.
# Fallback: '.' matches any one code point, so a character that no earlier rule joined to
# its neighbour forms a segment of its own.
3642e5b6d6dSopenharmony_ci.;
365