1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 2010-2011, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* file name: bytestrie.cpp 9* encoding: UTF-8 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2010sep25 14* created by: Markus W. Scherer 15*/ 16 17#include "unicode/utypes.h" 18#include "unicode/bytestream.h" 19#include "unicode/bytestrie.h" 20#include "unicode/uobject.h" 21#include "cmemory.h" 22#include "uassert.h" 23 24U_NAMESPACE_BEGIN 25 26BytesTrie::~BytesTrie() { 27 uprv_free(ownedArray_); 28} 29 30// lead byte already shifted right by 1. 31int32_t 32BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) { 33 int32_t value; 34 if(leadByte<kMinTwoByteValueLead) { 35 value=leadByte-kMinOneByteValueLead; 36 } else if(leadByte<kMinThreeByteValueLead) { 37 value=((leadByte-kMinTwoByteValueLead)<<8)|*pos; 38 } else if(leadByte<kFourByteValueLead) { 39 value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; 40 } else if(leadByte==kFourByteValueLead) { 41 value=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 42 } else { 43 value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 44 } 45 return value; 46} 47 48const uint8_t * 49BytesTrie::jumpByDelta(const uint8_t *pos) { 50 int32_t delta=*pos++; 51 if(delta<kMinTwoByteDeltaLead) { 52 // nothing to do 53 } else if(delta<kMinThreeByteDeltaLead) { 54 delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++; 55 } else if(delta<kFourByteDeltaLead) { 56 delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1]; 57 pos+=2; 58 } else if(delta==kFourByteDeltaLead) { 59 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 60 pos+=3; 61 } else { 62 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 63 pos+=4; 64 } 65 return pos+delta; 66} 67 68UStringTrieResult 69BytesTrie::current() const { 70 const uint8_t *pos=pos_; 71 if(pos==nullptr) { 72 return USTRINGTRIE_NO_MATCH; 73 } else { 74 int32_t node; 75 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 76 valueResult(node) : USTRINGTRIE_NO_VALUE; 77 } 78} 79 80UStringTrieResult 81BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) { 82 // Branch according to the current byte. 83 if(length==0) { 84 length=*pos++; 85 } 86 ++length; 87 // The length of the branch is the number of bytes to select from. 88 // The data structure encodes a binary search. 89 while(length>kMaxBranchLinearSubNodeLength) { 90 if(inByte<*pos++) { 91 length>>=1; 92 pos=jumpByDelta(pos); 93 } else { 94 length=length-(length>>1); 95 pos=skipDelta(pos); 96 } 97 } 98 // Drop down to linear search for the last few bytes. 99 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 100 // and divides length by 2. 101 do { 102 if(inByte==*pos++) { 103 UStringTrieResult result; 104 int32_t node=*pos; 105 U_ASSERT(node>=kMinValueLead); 106 if(node&kValueIsFinal) { 107 // Leave the final value for getValue() to read. 108 result=USTRINGTRIE_FINAL_VALUE; 109 } else { 110 // Use the non-final value as the jump delta. 111 ++pos; 112 // int32_t delta=readValue(pos, node>>1); 113 node>>=1; 114 int32_t delta; 115 if(node<kMinTwoByteValueLead) { 116 delta=node-kMinOneByteValueLead; 117 } else if(node<kMinThreeByteValueLead) { 118 delta=((node-kMinTwoByteValueLead)<<8)|*pos++; 119 } else if(node<kFourByteValueLead) { 120 delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; 121 pos+=2; 122 } else if(node==kFourByteValueLead) { 123 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 124 pos+=3; 125 } else { 126 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 127 pos+=4; 128 } 129 // end readValue() 130 pos+=delta; 131 node=*pos; 132 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 133 } 134 pos_=pos; 135 return result; 136 } 137 --length; 138 pos=skipValue(pos); 139 } while(length>1); 140 if(inByte==*pos++) { 141 pos_=pos; 142 int32_t node=*pos; 143 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 144 } else { 145 stop(); 146 return USTRINGTRIE_NO_MATCH; 147 } 148} 149 150UStringTrieResult 151BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) { 152 for(;;) { 153 int32_t node=*pos++; 154 if(node<kMinLinearMatch) { 155 return branchNext(pos, node, inByte); 156 } else if(node<kMinValueLead) { 157 // Match the first of length+1 bytes. 158 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 159 if(inByte==*pos++) { 160 remainingMatchLength_=--length; 161 pos_=pos; 162 return (length<0 && (node=*pos)>=kMinValueLead) ? 163 valueResult(node) : USTRINGTRIE_NO_VALUE; 164 } else { 165 // No match. 166 break; 167 } 168 } else if(node&kValueIsFinal) { 169 // No further matching bytes. 170 break; 171 } else { 172 // Skip intermediate value. 173 pos=skipValue(pos, node); 174 // The next node must not also be a value node. 175 U_ASSERT(*pos<kMinValueLead); 176 } 177 } 178 stop(); 179 return USTRINGTRIE_NO_MATCH; 180} 181 182UStringTrieResult 183BytesTrie::next(int32_t inByte) { 184 const uint8_t *pos=pos_; 185 if(pos==nullptr) { 186 return USTRINGTRIE_NO_MATCH; 187 } 188 if(inByte<0) { 189 inByte+=0x100; 190 } 191 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 192 if(length>=0) { 193 // Remaining part of a linear-match node. 194 if(inByte==*pos++) { 195 remainingMatchLength_=--length; 196 pos_=pos; 197 int32_t node; 198 return (length<0 && (node=*pos)>=kMinValueLead) ? 199 valueResult(node) : USTRINGTRIE_NO_VALUE; 200 } else { 201 stop(); 202 return USTRINGTRIE_NO_MATCH; 203 } 204 } 205 return nextImpl(pos, inByte); 206} 207 208UStringTrieResult 209BytesTrie::next(const char *s, int32_t sLength) { 210 if(sLength<0 ? *s==0 : sLength==0) { 211 // Empty input. 212 return current(); 213 } 214 const uint8_t *pos=pos_; 215 if(pos==nullptr) { 216 return USTRINGTRIE_NO_MATCH; 217 } 218 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 219 for(;;) { 220 // Fetch the next input byte, if there is one. 221 // Continue a linear-match node without rechecking sLength<0. 222 int32_t inByte; 223 if(sLength<0) { 224 for(;;) { 225 if((inByte=*s++)==0) { 226 remainingMatchLength_=length; 227 pos_=pos; 228 int32_t node; 229 return (length<0 && (node=*pos)>=kMinValueLead) ? 230 valueResult(node) : USTRINGTRIE_NO_VALUE; 231 } 232 if(length<0) { 233 remainingMatchLength_=length; 234 break; 235 } 236 if(inByte!=*pos) { 237 stop(); 238 return USTRINGTRIE_NO_MATCH; 239 } 240 ++pos; 241 --length; 242 } 243 } else { 244 for(;;) { 245 if(sLength==0) { 246 remainingMatchLength_=length; 247 pos_=pos; 248 int32_t node; 249 return (length<0 && (node=*pos)>=kMinValueLead) ? 250 valueResult(node) : USTRINGTRIE_NO_VALUE; 251 } 252 inByte=*s++; 253 --sLength; 254 if(length<0) { 255 remainingMatchLength_=length; 256 break; 257 } 258 if(inByte!=*pos) { 259 stop(); 260 return USTRINGTRIE_NO_MATCH; 261 } 262 ++pos; 263 --length; 264 } 265 } 266 for(;;) { 267 int32_t node=*pos++; 268 if(node<kMinLinearMatch) { 269 UStringTrieResult result=branchNext(pos, node, inByte); 270 if(result==USTRINGTRIE_NO_MATCH) { 271 return USTRINGTRIE_NO_MATCH; 272 } 273 // Fetch the next input byte, if there is one. 274 if(sLength<0) { 275 if((inByte=*s++)==0) { 276 return result; 277 } 278 } else { 279 if(sLength==0) { 280 return result; 281 } 282 inByte=*s++; 283 --sLength; 284 } 285 if(result==USTRINGTRIE_FINAL_VALUE) { 286 // No further matching bytes. 287 stop(); 288 return USTRINGTRIE_NO_MATCH; 289 } 290 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 291 } else if(node<kMinValueLead) { 292 // Match length+1 bytes. 293 length=node-kMinLinearMatch; // Actual match length minus 1. 294 if(inByte!=*pos) { 295 stop(); 296 return USTRINGTRIE_NO_MATCH; 297 } 298 ++pos; 299 --length; 300 break; 301 } else if(node&kValueIsFinal) { 302 // No further matching bytes. 303 stop(); 304 return USTRINGTRIE_NO_MATCH; 305 } else { 306 // Skip intermediate value. 307 pos=skipValue(pos, node); 308 // The next node must not also be a value node. 309 U_ASSERT(*pos<kMinValueLead); 310 } 311 } 312 } 313} 314 315const uint8_t * 316BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 317 UBool haveUniqueValue, int32_t &uniqueValue) { 318 while(length>kMaxBranchLinearSubNodeLength) { 319 ++pos; // ignore the comparison byte 320 if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 321 return nullptr; 322 } 323 length=length-(length>>1); 324 pos=skipDelta(pos); 325 } 326 do { 327 ++pos; // ignore a comparison byte 328 // handle its value 329 int32_t node=*pos++; 330 UBool isFinal=(UBool)(node&kValueIsFinal); 331 int32_t value=readValue(pos, node>>1); 332 pos=skipValue(pos, node); 333 if(isFinal) { 334 if(haveUniqueValue) { 335 if(value!=uniqueValue) { 336 return nullptr; 337 } 338 } else { 339 uniqueValue=value; 340 haveUniqueValue=true; 341 } 342 } else { 343 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 344 return nullptr; 345 } 346 haveUniqueValue=true; 347 } 348 } while(--length>1); 349 return pos+1; // ignore the last comparison byte 350} 351 352UBool 353BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 354 for(;;) { 355 int32_t node=*pos++; 356 if(node<kMinLinearMatch) { 357 if(node==0) { 358 node=*pos++; 359 } 360 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 361 if(pos==nullptr) { 362 return false; 363 } 364 haveUniqueValue=true; 365 } else if(node<kMinValueLead) { 366 // linear-match node 367 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 368 } else { 369 UBool isFinal=(UBool)(node&kValueIsFinal); 370 int32_t value=readValue(pos, node>>1); 371 if(haveUniqueValue) { 372 if(value!=uniqueValue) { 373 return false; 374 } 375 } else { 376 uniqueValue=value; 377 haveUniqueValue=true; 378 } 379 if(isFinal) { 380 return true; 381 } 382 pos=skipValue(pos, node); 383 } 384 } 385} 386 387int32_t 388BytesTrie::getNextBytes(ByteSink &out) const { 389 const uint8_t *pos=pos_; 390 if(pos==nullptr) { 391 return 0; 392 } 393 if(remainingMatchLength_>=0) { 394 append(out, *pos); // Next byte of a pending linear-match node. 395 return 1; 396 } 397 int32_t node=*pos++; 398 if(node>=kMinValueLead) { 399 if(node&kValueIsFinal) { 400 return 0; 401 } else { 402 pos=skipValue(pos, node); 403 node=*pos++; 404 U_ASSERT(node<kMinValueLead); 405 } 406 } 407 if(node<kMinLinearMatch) { 408 if(node==0) { 409 node=*pos++; 410 } 411 getNextBranchBytes(pos, ++node, out); 412 return node; 413 } else { 414 // First byte of the linear-match node. 415 append(out, *pos); 416 return 1; 417 } 418} 419 420void 421BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) { 422 while(length>kMaxBranchLinearSubNodeLength) { 423 ++pos; // ignore the comparison byte 424 getNextBranchBytes(jumpByDelta(pos), length>>1, out); 425 length=length-(length>>1); 426 pos=skipDelta(pos); 427 } 428 do { 429 append(out, *pos++); 430 pos=skipValue(pos); 431 } while(--length>1); 432 append(out, *pos); 433} 434 435void 436BytesTrie::append(ByteSink &out, int c) { 437 char ch=(char)c; 438 out.Append(&ch, 1); 439} 440 441U_NAMESPACE_END 442