1cb93a386Sopenharmony_ci#include "./durchschlag.h" 2cb93a386Sopenharmony_ci 3cb93a386Sopenharmony_ci#include <algorithm> 4cb93a386Sopenharmony_ci#include <exception> /* terminate */ 5cb93a386Sopenharmony_ci 6cb93a386Sopenharmony_ci#include "divsufsort.h" 7cb93a386Sopenharmony_ci 8cb93a386Sopenharmony_ci/* Pointer to position in text. */ 9cb93a386Sopenharmony_citypedef DurchschlagTextIdx TextIdx; 10cb93a386Sopenharmony_ci 11cb93a386Sopenharmony_ci/* (Sum of) value(s) of slice(s). */ 12cb93a386Sopenharmony_citypedef uint32_t Score; 13cb93a386Sopenharmony_ci 14cb93a386Sopenharmony_citypedef struct HashSlot { 15cb93a386Sopenharmony_ci TextIdx next; 16cb93a386Sopenharmony_ci TextIdx offset; 17cb93a386Sopenharmony_ci} HashSlot; 18cb93a386Sopenharmony_ci 19cb93a386Sopenharmony_citypedef struct MetaSlot { 20cb93a386Sopenharmony_ci TextIdx mark; 21cb93a386Sopenharmony_ci Score score; 22cb93a386Sopenharmony_ci} MetaSlot; 23cb93a386Sopenharmony_ci 24cb93a386Sopenharmony_citypedef struct Range { 25cb93a386Sopenharmony_ci TextIdx start; 26cb93a386Sopenharmony_ci TextIdx end; 27cb93a386Sopenharmony_ci} Range; 28cb93a386Sopenharmony_ci 29cb93a386Sopenharmony_citypedef struct Candidate { 30cb93a386Sopenharmony_ci Score score; 31cb93a386Sopenharmony_ci TextIdx position; 32cb93a386Sopenharmony_ci} Candidate; 33cb93a386Sopenharmony_ci 34cb93a386Sopenharmony_cistruct greaterScore { 35cb93a386Sopenharmony_ci bool operator()(const Candidate& a, const Candidate& b) const { 36cb93a386Sopenharmony_ci return (a.score > b.score) || 37cb93a386Sopenharmony_ci ((a.score == b.score) && (a.position < b.position)); 38cb93a386Sopenharmony_ci } 39cb93a386Sopenharmony_ci}; 40cb93a386Sopenharmony_ci 41cb93a386Sopenharmony_cistruct lessScore { 42cb93a386Sopenharmony_ci bool operator()(const Candidate& a, const Candidate& b) const { 43cb93a386Sopenharmony_ci return (a.score < b.score) || 44cb93a386Sopenharmony_ci ((a.score == b.score) && (a.position > b.position)); 45cb93a386Sopenharmony_ci } 46cb93a386Sopenharmony_ci}; 47cb93a386Sopenharmony_ci 48cb93a386Sopenharmony_ci#define CANDIDATE_BUNDLE_SIZE (1 << 18) 49cb93a386Sopenharmony_ci 50cb93a386Sopenharmony_cistatic void fatal(const char* error) { 51cb93a386Sopenharmony_ci fprintf(stderr, "%s\n", error); 52cb93a386Sopenharmony_ci std::terminate(); 53cb93a386Sopenharmony_ci} 54cb93a386Sopenharmony_ci 55cb93a386Sopenharmony_cistatic TextIdx calculateDictionarySize(const std::vector<Range>& ranges) { 56cb93a386Sopenharmony_ci TextIdx result = 0; 57cb93a386Sopenharmony_ci for (size_t i = 0; i < ranges.size(); ++i) { 58cb93a386Sopenharmony_ci const Range& r = ranges[i]; 59cb93a386Sopenharmony_ci result += r.end - r.start; 60cb93a386Sopenharmony_ci } 61cb93a386Sopenharmony_ci return result; 62cb93a386Sopenharmony_ci} 63cb93a386Sopenharmony_ci 64cb93a386Sopenharmony_cistatic std::string createDictionary( 65cb93a386Sopenharmony_ci const uint8_t* data, const std::vector<Range>& ranges, size_t limit) { 66cb93a386Sopenharmony_ci std::string output; 67cb93a386Sopenharmony_ci output.reserve(calculateDictionarySize(ranges)); 68cb93a386Sopenharmony_ci for (size_t i = 0; i < ranges.size(); ++i) { 69cb93a386Sopenharmony_ci const Range& r = ranges[i]; 70cb93a386Sopenharmony_ci output.insert(output.end(), &data[r.start], &data[r.end]); 71cb93a386Sopenharmony_ci } 72cb93a386Sopenharmony_ci if (output.size() > limit) { 73cb93a386Sopenharmony_ci output.resize(limit); 74cb93a386Sopenharmony_ci } 75cb93a386Sopenharmony_ci return output; 76cb93a386Sopenharmony_ci} 77cb93a386Sopenharmony_ci 78cb93a386Sopenharmony_ci/* precondition: span > 0 79cb93a386Sopenharmony_ci precondition: end + span == len(shortcut) */ 80cb93a386Sopenharmony_cistatic Score buildCandidatesList(std::vector<Candidate>* candidates, 81cb93a386Sopenharmony_ci std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut, 82cb93a386Sopenharmony_ci TextIdx end) { 83cb93a386Sopenharmony_ci candidates->resize(0); 84cb93a386Sopenharmony_ci 85cb93a386Sopenharmony_ci size_t n = map->size(); 86cb93a386Sopenharmony_ci MetaSlot* slots = map->data(); 87cb93a386Sopenharmony_ci for (size_t j = 0; j < n; ++j) { 88cb93a386Sopenharmony_ci slots[j].mark = 0; 89cb93a386Sopenharmony_ci } 90cb93a386Sopenharmony_ci 91cb93a386Sopenharmony_ci Score score = 0; 92cb93a386Sopenharmony_ci /* Consider the whole span, except one last item. The following loop will 93cb93a386Sopenharmony_ci add the last item to the end of the "chain", evaluate it, and cut one 94cb93a386Sopenharmony_ci "link" form the beginning. */ 95cb93a386Sopenharmony_ci for (size_t j = 0; j < span - 1; ++j) { 96cb93a386Sopenharmony_ci MetaSlot& item = slots[shortcut[j]]; 97cb93a386Sopenharmony_ci if (item.mark == 0) { 98cb93a386Sopenharmony_ci score += item.score; 99cb93a386Sopenharmony_ci } 100cb93a386Sopenharmony_ci item.mark++; 101cb93a386Sopenharmony_ci } 102cb93a386Sopenharmony_ci 103cb93a386Sopenharmony_ci TextIdx i = 0; 104cb93a386Sopenharmony_ci TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE); 105cb93a386Sopenharmony_ci Score maxScore = 0; 106cb93a386Sopenharmony_ci for (; i < limit; ++i) { 107cb93a386Sopenharmony_ci TextIdx slice = shortcut[i + span - 1]; 108cb93a386Sopenharmony_ci MetaSlot& pick = slots[slice]; 109cb93a386Sopenharmony_ci if (pick.mark == 0) { 110cb93a386Sopenharmony_ci score += pick.score; 111cb93a386Sopenharmony_ci } 112cb93a386Sopenharmony_ci pick.mark++; 113cb93a386Sopenharmony_ci 114cb93a386Sopenharmony_ci if (score > maxScore) { 115cb93a386Sopenharmony_ci maxScore = score; 116cb93a386Sopenharmony_ci } 117cb93a386Sopenharmony_ci candidates->push_back({score, i}); 118cb93a386Sopenharmony_ci 119cb93a386Sopenharmony_ci MetaSlot& drop = slots[shortcut[i]]; 120cb93a386Sopenharmony_ci drop.mark--; 121cb93a386Sopenharmony_ci if (drop.mark == 0) { 122cb93a386Sopenharmony_ci score -= drop.score; 123cb93a386Sopenharmony_ci } 124cb93a386Sopenharmony_ci } 125cb93a386Sopenharmony_ci 126cb93a386Sopenharmony_ci std::make_heap(candidates->begin(), candidates->end(), greaterScore()); 127cb93a386Sopenharmony_ci Score minScore = candidates->at(0).score; 128cb93a386Sopenharmony_ci for (; i < end; ++i) { 129cb93a386Sopenharmony_ci TextIdx slice = shortcut[i + span - 1]; 130cb93a386Sopenharmony_ci MetaSlot& pick = slots[slice]; 131cb93a386Sopenharmony_ci if (pick.mark == 0) { 132cb93a386Sopenharmony_ci score += pick.score; 133cb93a386Sopenharmony_ci } 134cb93a386Sopenharmony_ci pick.mark++; 135cb93a386Sopenharmony_ci 136cb93a386Sopenharmony_ci if (score > maxScore) { 137cb93a386Sopenharmony_ci maxScore = score; 138cb93a386Sopenharmony_ci } 139cb93a386Sopenharmony_ci if (score >= minScore) { 140cb93a386Sopenharmony_ci candidates->push_back({score, i}); 141cb93a386Sopenharmony_ci std::push_heap(candidates->begin(), candidates->end(), greaterScore()); 142cb93a386Sopenharmony_ci if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) { 143cb93a386Sopenharmony_ci while (candidates->at(0).score == minScore) { 144cb93a386Sopenharmony_ci std::pop_heap(candidates->begin(), candidates->end(), greaterScore()); 145cb93a386Sopenharmony_ci candidates->pop_back(); 146cb93a386Sopenharmony_ci } 147cb93a386Sopenharmony_ci minScore = candidates->at(0).score; 148cb93a386Sopenharmony_ci } 149cb93a386Sopenharmony_ci } 150cb93a386Sopenharmony_ci 151cb93a386Sopenharmony_ci MetaSlot& drop = slots[shortcut[i]]; 152cb93a386Sopenharmony_ci drop.mark--; 153cb93a386Sopenharmony_ci if (drop.mark == 0) { 154cb93a386Sopenharmony_ci score -= drop.score; 155cb93a386Sopenharmony_ci } 156cb93a386Sopenharmony_ci } 157cb93a386Sopenharmony_ci 158cb93a386Sopenharmony_ci for (size_t j = 0; j < n; ++j) { 159cb93a386Sopenharmony_ci slots[j].mark = 0; 160cb93a386Sopenharmony_ci } 161cb93a386Sopenharmony_ci 162cb93a386Sopenharmony_ci std::make_heap(candidates->begin(), candidates->end(), lessScore()); 163cb93a386Sopenharmony_ci return minScore; 164cb93a386Sopenharmony_ci} 165cb93a386Sopenharmony_ci 166cb93a386Sopenharmony_ci/* precondition: span > 0 167cb93a386Sopenharmony_ci precondition: end + span == len(shortcut) */ 168cb93a386Sopenharmony_cistatic Score rebuildCandidatesList(std::vector<TextIdx>* candidates, 169cb93a386Sopenharmony_ci std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut, 170cb93a386Sopenharmony_ci TextIdx end, TextIdx* next) { 171cb93a386Sopenharmony_ci size_t n = candidates->size(); 172cb93a386Sopenharmony_ci TextIdx* data = candidates->data(); 173cb93a386Sopenharmony_ci for (size_t i = 0; i < n; ++i) { 174cb93a386Sopenharmony_ci data[i] = 0; 175cb93a386Sopenharmony_ci } 176cb93a386Sopenharmony_ci 177cb93a386Sopenharmony_ci n = map->size(); 178cb93a386Sopenharmony_ci MetaSlot* slots = map->data(); 179cb93a386Sopenharmony_ci for (size_t i = 0; i < n; ++i) { 180cb93a386Sopenharmony_ci slots[i].mark = 0; 181cb93a386Sopenharmony_ci } 182cb93a386Sopenharmony_ci 183cb93a386Sopenharmony_ci Score score = 0; 184cb93a386Sopenharmony_ci /* Consider the whole span, except one last item. The following loop will 185cb93a386Sopenharmony_ci add the last item to the end of the "chain", evaluate it, and cut one 186cb93a386Sopenharmony_ci "link" form the beginning. */ 187cb93a386Sopenharmony_ci for (TextIdx i = 0; i < span - 1; ++i) { 188cb93a386Sopenharmony_ci MetaSlot& item = slots[shortcut[i]]; 189cb93a386Sopenharmony_ci if (item.mark == 0) { 190cb93a386Sopenharmony_ci score += item.score; 191cb93a386Sopenharmony_ci } 192cb93a386Sopenharmony_ci item.mark++; 193cb93a386Sopenharmony_ci } 194cb93a386Sopenharmony_ci 195cb93a386Sopenharmony_ci Score maxScore = 0; 196cb93a386Sopenharmony_ci for (TextIdx i = 0; i < end; ++i) { 197cb93a386Sopenharmony_ci MetaSlot& pick = slots[shortcut[i + span - 1]]; 198cb93a386Sopenharmony_ci if (pick.mark == 0) { 199cb93a386Sopenharmony_ci score += pick.score; 200cb93a386Sopenharmony_ci } 201cb93a386Sopenharmony_ci pick.mark++; 202cb93a386Sopenharmony_ci 203cb93a386Sopenharmony_ci if (candidates->size() <= score) { 204cb93a386Sopenharmony_ci candidates->resize(score + 1); 205cb93a386Sopenharmony_ci } 206cb93a386Sopenharmony_ci if (score > maxScore) { 207cb93a386Sopenharmony_ci maxScore = score; 208cb93a386Sopenharmony_ci } 209cb93a386Sopenharmony_ci next[i] = candidates->at(score); 210cb93a386Sopenharmony_ci candidates->at(score) = i; 211cb93a386Sopenharmony_ci 212cb93a386Sopenharmony_ci MetaSlot& drop = slots[shortcut[i]]; 213cb93a386Sopenharmony_ci drop.mark--; 214cb93a386Sopenharmony_ci if (drop.mark == 0) { 215cb93a386Sopenharmony_ci score -= drop.score; 216cb93a386Sopenharmony_ci } 217cb93a386Sopenharmony_ci } 218cb93a386Sopenharmony_ci 219cb93a386Sopenharmony_ci for (size_t i = 0; i < n; ++i) { 220cb93a386Sopenharmony_ci slots[i].mark = 0; 221cb93a386Sopenharmony_ci } 222cb93a386Sopenharmony_ci 223cb93a386Sopenharmony_ci candidates->resize(maxScore + 1); 224cb93a386Sopenharmony_ci return maxScore; 225cb93a386Sopenharmony_ci} 226cb93a386Sopenharmony_ci 227cb93a386Sopenharmony_cistatic void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) { 228cb93a386Sopenharmony_ci for (auto it = ranges->begin(); it != ranges->end();) { 229cb93a386Sopenharmony_ci if (end < it->start) { 230cb93a386Sopenharmony_ci ranges->insert(it, {start, end}); 231cb93a386Sopenharmony_ci return; 232cb93a386Sopenharmony_ci } 233cb93a386Sopenharmony_ci if (it->end < start) { 234cb93a386Sopenharmony_ci it++; 235cb93a386Sopenharmony_ci continue; 236cb93a386Sopenharmony_ci } 237cb93a386Sopenharmony_ci // Combine with existing. 238cb93a386Sopenharmony_ci start = std::min(start, it->start); 239cb93a386Sopenharmony_ci end = std::max(end, it->end); 240cb93a386Sopenharmony_ci // Remove consumed vector and continue. 241cb93a386Sopenharmony_ci it = ranges->erase(it); 242cb93a386Sopenharmony_ci } 243cb93a386Sopenharmony_ci ranges->push_back({start, end}); 244cb93a386Sopenharmony_ci} 245cb93a386Sopenharmony_ci 246cb93a386Sopenharmony_cistd::string durchschlag_generate( 247cb93a386Sopenharmony_ci size_t dictionary_size_limit, size_t slice_len, size_t block_len, 248cb93a386Sopenharmony_ci const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { 249cb93a386Sopenharmony_ci DurchschlagContext ctx = durchschlag_prepare( 250cb93a386Sopenharmony_ci slice_len, sample_sizes, sample_data); 251cb93a386Sopenharmony_ci return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE, 252cb93a386Sopenharmony_ci dictionary_size_limit, block_len, ctx, sample_data); 253cb93a386Sopenharmony_ci} 254cb93a386Sopenharmony_ci 255cb93a386Sopenharmony_ciDurchschlagContext durchschlag_prepare(size_t slice_len, 256cb93a386Sopenharmony_ci const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { 257cb93a386Sopenharmony_ci /* Parameters aliasing */ 258cb93a386Sopenharmony_ci TextIdx sliceLen = static_cast<TextIdx>(slice_len); 259cb93a386Sopenharmony_ci if (sliceLen != slice_len) fatal("slice_len is too large"); 260cb93a386Sopenharmony_ci if (sliceLen < 1) fatal("slice_len is too small"); 261cb93a386Sopenharmony_ci const uint8_t* data = sample_data; 262cb93a386Sopenharmony_ci 263cb93a386Sopenharmony_ci TextIdx total = 0; 264cb93a386Sopenharmony_ci std::vector<TextIdx> offsets; 265cb93a386Sopenharmony_ci offsets.reserve(sample_sizes.size()); 266cb93a386Sopenharmony_ci for (size_t i = 0; i < sample_sizes.size(); ++i) { 267cb93a386Sopenharmony_ci TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); 268cb93a386Sopenharmony_ci if (delta != sample_sizes[i]) fatal("sample is too large"); 269cb93a386Sopenharmony_ci if (delta == 0) fatal("0-length samples are prohibited"); 270cb93a386Sopenharmony_ci TextIdx next_total = total + delta; 271cb93a386Sopenharmony_ci if (next_total <= total) fatal("corpus is too large"); 272cb93a386Sopenharmony_ci total = next_total; 273cb93a386Sopenharmony_ci offsets.push_back(total); 274cb93a386Sopenharmony_ci } 275cb93a386Sopenharmony_ci 276cb93a386Sopenharmony_ci if (total < sliceLen) fatal("slice_len is larger than corpus size"); 277cb93a386Sopenharmony_ci TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1; 278cb93a386Sopenharmony_ci TextIdx hashLen = 11; 279cb93a386Sopenharmony_ci while (hashLen < 29 && ((1u << hashLen) < end)) { 280cb93a386Sopenharmony_ci hashLen += 3; 281cb93a386Sopenharmony_ci } 282cb93a386Sopenharmony_ci hashLen -= 3; 283cb93a386Sopenharmony_ci TextIdx hashMask = (1u << hashLen) - 1u; 284cb93a386Sopenharmony_ci std::vector<TextIdx> hashHead(1 << hashLen); 285cb93a386Sopenharmony_ci TextIdx hash = 0; 286cb93a386Sopenharmony_ci TextIdx lShift = 3; 287cb93a386Sopenharmony_ci TextIdx rShift = hashLen - lShift; 288cb93a386Sopenharmony_ci for (TextIdx i = 0; i < sliceLen - 1; ++i) { 289cb93a386Sopenharmony_ci TextIdx v = data[i]; 290cb93a386Sopenharmony_ci hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; 291cb93a386Sopenharmony_ci } 292cb93a386Sopenharmony_ci TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; 293cb93a386Sopenharmony_ci TextIdx rShiftX = hashLen - lShiftX; 294cb93a386Sopenharmony_ci 295cb93a386Sopenharmony_ci std::vector<HashSlot> map; 296cb93a386Sopenharmony_ci map.push_back({0, 0}); 297cb93a386Sopenharmony_ci TextIdx hashSlot = 1; 298cb93a386Sopenharmony_ci std::vector<TextIdx> sliceMap; 299cb93a386Sopenharmony_ci sliceMap.reserve(end); 300cb93a386Sopenharmony_ci for (TextIdx i = 0; i < end; ++i) { 301cb93a386Sopenharmony_ci TextIdx v = data[i + sliceLen - 1]; 302cb93a386Sopenharmony_ci TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; 303cb93a386Sopenharmony_ci v = data[i]; 304cb93a386Sopenharmony_ci hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask); 305cb93a386Sopenharmony_ci TextIdx slot = hashHead[bucket]; 306cb93a386Sopenharmony_ci while (slot != 0) { 307cb93a386Sopenharmony_ci HashSlot& item = map[slot]; 308cb93a386Sopenharmony_ci TextIdx start = item.offset; 309cb93a386Sopenharmony_ci bool miss = false; 310cb93a386Sopenharmony_ci for (TextIdx j = 0; j < sliceLen; ++j) { 311cb93a386Sopenharmony_ci if (data[i + j] != data[start + j]) { 312cb93a386Sopenharmony_ci miss = true; 313cb93a386Sopenharmony_ci break; 314cb93a386Sopenharmony_ci } 315cb93a386Sopenharmony_ci } 316cb93a386Sopenharmony_ci if (!miss) { 317cb93a386Sopenharmony_ci sliceMap.push_back(slot); 318cb93a386Sopenharmony_ci break; 319cb93a386Sopenharmony_ci } 320cb93a386Sopenharmony_ci slot = item.next; 321cb93a386Sopenharmony_ci } 322cb93a386Sopenharmony_ci if (slot == 0) { 323cb93a386Sopenharmony_ci map.push_back({hashHead[bucket], i}); 324cb93a386Sopenharmony_ci hashHead[bucket] = hashSlot; 325cb93a386Sopenharmony_ci sliceMap.push_back(hashSlot); 326cb93a386Sopenharmony_ci hashSlot++; 327cb93a386Sopenharmony_ci } 328cb93a386Sopenharmony_ci } 329cb93a386Sopenharmony_ci 330cb93a386Sopenharmony_ci return {total, sliceLen, static_cast<TextIdx>(map.size()), 331cb93a386Sopenharmony_ci std::move(offsets), std::move(sliceMap)}; 332cb93a386Sopenharmony_ci} 333cb93a386Sopenharmony_ci 334cb93a386Sopenharmony_ciDurchschlagContext durchschlag_prepare(size_t slice_len, 335cb93a386Sopenharmony_ci const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) { 336cb93a386Sopenharmony_ci /* Parameters aliasing */ 337cb93a386Sopenharmony_ci TextIdx sliceLen = static_cast<TextIdx>(slice_len); 338cb93a386Sopenharmony_ci if (sliceLen != slice_len) fatal("slice_len is too large"); 339cb93a386Sopenharmony_ci if (sliceLen < 1) fatal("slice_len is too small"); 340cb93a386Sopenharmony_ci const TextIdx* lcp = index.lcp.data(); 341cb93a386Sopenharmony_ci const TextIdx* sa = index.sa.data(); 342cb93a386Sopenharmony_ci 343cb93a386Sopenharmony_ci TextIdx total = 0; 344cb93a386Sopenharmony_ci std::vector<TextIdx> offsets; 345cb93a386Sopenharmony_ci offsets.reserve(sample_sizes.size()); 346cb93a386Sopenharmony_ci for (size_t i = 0; i < sample_sizes.size(); ++i) { 347cb93a386Sopenharmony_ci TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); 348cb93a386Sopenharmony_ci if (delta != sample_sizes[i]) fatal("sample is too large"); 349cb93a386Sopenharmony_ci if (delta == 0) fatal("0-length samples are prohibited"); 350cb93a386Sopenharmony_ci TextIdx next_total = total + delta; 351cb93a386Sopenharmony_ci if (next_total <= total) fatal("corpus is too large"); 352cb93a386Sopenharmony_ci total = next_total; 353cb93a386Sopenharmony_ci offsets.push_back(total); 354cb93a386Sopenharmony_ci } 355cb93a386Sopenharmony_ci 356cb93a386Sopenharmony_ci if (total < sliceLen) fatal("slice_len is larger than corpus size"); 357cb93a386Sopenharmony_ci TextIdx counter = 1; 358cb93a386Sopenharmony_ci TextIdx end = total - sliceLen + 1; 359cb93a386Sopenharmony_ci std::vector<TextIdx> sliceMap(total); 360cb93a386Sopenharmony_ci TextIdx last = 0; 361cb93a386Sopenharmony_ci TextIdx current = 1; 362cb93a386Sopenharmony_ci while (current <= total) { 363cb93a386Sopenharmony_ci if (lcp[current - 1] < sliceLen) { 364cb93a386Sopenharmony_ci for (TextIdx i = last; i < current; ++i) { 365cb93a386Sopenharmony_ci sliceMap[sa[i]] = counter; 366cb93a386Sopenharmony_ci } 367cb93a386Sopenharmony_ci counter++; 368cb93a386Sopenharmony_ci last = current; 369cb93a386Sopenharmony_ci } 370cb93a386Sopenharmony_ci current++; 371cb93a386Sopenharmony_ci } 372cb93a386Sopenharmony_ci sliceMap.resize(end); 373cb93a386Sopenharmony_ci 374cb93a386Sopenharmony_ci // Reorder items for the better locality. 375cb93a386Sopenharmony_ci std::vector<TextIdx> reorder(counter); 376cb93a386Sopenharmony_ci counter = 1; 377cb93a386Sopenharmony_ci for (TextIdx i = 0; i < end; ++i) { 378cb93a386Sopenharmony_ci if (reorder[sliceMap[i]] == 0) { 379cb93a386Sopenharmony_ci reorder[sliceMap[i]] = counter++; 380cb93a386Sopenharmony_ci } 381cb93a386Sopenharmony_ci } 382cb93a386Sopenharmony_ci for (TextIdx i = 0; i < end; ++i) { 383cb93a386Sopenharmony_ci sliceMap[i] = reorder[sliceMap[i]]; 384cb93a386Sopenharmony_ci } 385cb93a386Sopenharmony_ci 386cb93a386Sopenharmony_ci return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)}; 387cb93a386Sopenharmony_ci} 388cb93a386Sopenharmony_ci 389cb93a386Sopenharmony_ciDurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) { 390cb93a386Sopenharmony_ci TextIdx total = static_cast<TextIdx>(data.size()); 391cb93a386Sopenharmony_ci if (total != data.size()) fatal("corpus is too large"); 392cb93a386Sopenharmony_ci saidx_t saTotal = static_cast<saidx_t>(total); 393cb93a386Sopenharmony_ci if (saTotal < 0) fatal("corpus is too large"); 394cb93a386Sopenharmony_ci if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large"); 395cb93a386Sopenharmony_ci std::vector<TextIdx> sa(total); 396cb93a386Sopenharmony_ci /* Hopefully, non-negative int32_t values match TextIdx ones. */ 397cb93a386Sopenharmony_ci if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch"); 398cb93a386Sopenharmony_ci int32_t* saData = reinterpret_cast<int32_t*>(sa.data()); 399cb93a386Sopenharmony_ci divsufsort(data.data(), saData, saTotal); 400cb93a386Sopenharmony_ci 401cb93a386Sopenharmony_ci std::vector<TextIdx> isa(total); 402cb93a386Sopenharmony_ci for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i; 403cb93a386Sopenharmony_ci 404cb93a386Sopenharmony_ci // TODO: borrowed -> unknown efficiency. 405cb93a386Sopenharmony_ci std::vector<TextIdx> lcp(total); 406cb93a386Sopenharmony_ci TextIdx k = 0; 407cb93a386Sopenharmony_ci lcp[total - 1] = 0; 408cb93a386Sopenharmony_ci for (TextIdx i = 0; i < total; ++i) { 409cb93a386Sopenharmony_ci TextIdx current = isa[i]; 410cb93a386Sopenharmony_ci if (current == total - 1) { 411cb93a386Sopenharmony_ci k = 0; 412cb93a386Sopenharmony_ci continue; 413cb93a386Sopenharmony_ci } 414cb93a386Sopenharmony_ci TextIdx j = sa[current + 1]; // Suffix which follow i-th suffix. 415cb93a386Sopenharmony_ci while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) { 416cb93a386Sopenharmony_ci ++k; 417cb93a386Sopenharmony_ci } 418cb93a386Sopenharmony_ci lcp[current] = k; 419cb93a386Sopenharmony_ci if (k > 0) --k; 420cb93a386Sopenharmony_ci } 421cb93a386Sopenharmony_ci 422cb93a386Sopenharmony_ci return {std::move(lcp), std::move(sa)}; 423cb93a386Sopenharmony_ci} 424cb93a386Sopenharmony_ci 425cb93a386Sopenharmony_cistatic void ScoreSlices(const std::vector<TextIdx>& offsets, 426cb93a386Sopenharmony_ci std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) { 427cb93a386Sopenharmony_ci TextIdx piece = 0; 428cb93a386Sopenharmony_ci /* Fresh map contains all zeroes -> initial mark should be different. */ 429cb93a386Sopenharmony_ci TextIdx mark = 1; 430cb93a386Sopenharmony_ci for (TextIdx i = 0; i < end; ++i) { 431cb93a386Sopenharmony_ci if (offsets[piece] == i) { 432cb93a386Sopenharmony_ci piece++; 433cb93a386Sopenharmony_ci mark++; 434cb93a386Sopenharmony_ci } 435cb93a386Sopenharmony_ci MetaSlot& item = map[shortcut[i]]; 436cb93a386Sopenharmony_ci if (item.mark != mark) { 437cb93a386Sopenharmony_ci item.mark = mark; 438cb93a386Sopenharmony_ci item.score++; 439cb93a386Sopenharmony_ci } 440cb93a386Sopenharmony_ci } 441cb93a386Sopenharmony_ci} 442cb93a386Sopenharmony_ci 443cb93a386Sopenharmony_cistatic std::string durchschlagGenerateExclusive( 444cb93a386Sopenharmony_ci size_t dictionary_size_limit, size_t block_len, 445cb93a386Sopenharmony_ci const DurchschlagContext& context, const uint8_t* sample_data) { 446cb93a386Sopenharmony_ci /* Parameters aliasing */ 447cb93a386Sopenharmony_ci TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); 448cb93a386Sopenharmony_ci if (targetSize != dictionary_size_limit) { 449cb93a386Sopenharmony_ci fprintf(stderr, "dictionary_size_limit is too large\n"); 450cb93a386Sopenharmony_ci return ""; 451cb93a386Sopenharmony_ci } 452cb93a386Sopenharmony_ci TextIdx sliceLen = context.sliceLen; 453cb93a386Sopenharmony_ci TextIdx total = context.dataSize; 454cb93a386Sopenharmony_ci TextIdx blockLen = static_cast<TextIdx>(block_len); 455cb93a386Sopenharmony_ci if (blockLen != block_len) { 456cb93a386Sopenharmony_ci fprintf(stderr, "block_len is too large\n"); 457cb93a386Sopenharmony_ci return ""; 458cb93a386Sopenharmony_ci } 459cb93a386Sopenharmony_ci const uint8_t* data = sample_data; 460cb93a386Sopenharmony_ci const std::vector<TextIdx>& offsets = context.offsets; 461cb93a386Sopenharmony_ci std::vector<MetaSlot> map(context.numUniqueSlices); 462cb93a386Sopenharmony_ci const TextIdx* shortcut = context.sliceMap.data(); 463cb93a386Sopenharmony_ci 464cb93a386Sopenharmony_ci /* Initialization */ 465cb93a386Sopenharmony_ci if (blockLen < sliceLen) { 466cb93a386Sopenharmony_ci fprintf(stderr, "sliceLen is larger than block_len\n"); 467cb93a386Sopenharmony_ci return ""; 468cb93a386Sopenharmony_ci } 469cb93a386Sopenharmony_ci if (targetSize < blockLen || total < blockLen) { 470cb93a386Sopenharmony_ci fprintf(stderr, "block_len is too large\n"); 471cb93a386Sopenharmony_ci return ""; 472cb93a386Sopenharmony_ci } 473cb93a386Sopenharmony_ci TextIdx end = total - sliceLen + 1; 474cb93a386Sopenharmony_ci ScoreSlices(offsets, map, shortcut, end); 475cb93a386Sopenharmony_ci TextIdx span = blockLen - sliceLen + 1; 476cb93a386Sopenharmony_ci end = static_cast<TextIdx>(context.sliceMap.size()) - span; 477cb93a386Sopenharmony_ci std::vector<TextIdx> candidates; 478cb93a386Sopenharmony_ci std::vector<TextIdx> next(end); 479cb93a386Sopenharmony_ci Score maxScore = rebuildCandidatesList( 480cb93a386Sopenharmony_ci &candidates, &map, span, shortcut, end, next.data()); 481cb93a386Sopenharmony_ci 482cb93a386Sopenharmony_ci /* Block selection */ 483cb93a386Sopenharmony_ci const size_t triesLimit = (600 * 1000000) / span; 484cb93a386Sopenharmony_ci const size_t candidatesLimit = (150 * 1000000) / span; 485cb93a386Sopenharmony_ci std::vector<Range> ranges; 486cb93a386Sopenharmony_ci TextIdx mark = 0; 487cb93a386Sopenharmony_ci size_t numTries = 0; 488cb93a386Sopenharmony_ci while (true) { 489cb93a386Sopenharmony_ci TextIdx dictSize = calculateDictionarySize(ranges); 490cb93a386Sopenharmony_ci size_t numCandidates = 0; 491cb93a386Sopenharmony_ci if (dictSize > targetSize - blockLen) { 492cb93a386Sopenharmony_ci break; 493cb93a386Sopenharmony_ci } 494cb93a386Sopenharmony_ci if (maxScore == 0) { 495cb93a386Sopenharmony_ci break; 496cb93a386Sopenharmony_ci } 497cb93a386Sopenharmony_ci while (true) { 498cb93a386Sopenharmony_ci TextIdx candidate = 0; 499cb93a386Sopenharmony_ci while (maxScore > 0) { 500cb93a386Sopenharmony_ci if (candidates[maxScore] != 0) { 501cb93a386Sopenharmony_ci candidate = candidates[maxScore]; 502cb93a386Sopenharmony_ci candidates[maxScore] = next[candidate]; 503cb93a386Sopenharmony_ci break; 504cb93a386Sopenharmony_ci } 505cb93a386Sopenharmony_ci maxScore--; 506cb93a386Sopenharmony_ci } 507cb93a386Sopenharmony_ci if (maxScore == 0) { 508cb93a386Sopenharmony_ci break; 509cb93a386Sopenharmony_ci } 510cb93a386Sopenharmony_ci mark++; 511cb93a386Sopenharmony_ci numTries++; 512cb93a386Sopenharmony_ci numCandidates++; 513cb93a386Sopenharmony_ci Score score = 0; 514cb93a386Sopenharmony_ci for (size_t j = candidate; j < candidate + span; ++j) { 515cb93a386Sopenharmony_ci MetaSlot& item = map[shortcut[j]]; 516cb93a386Sopenharmony_ci if (item.mark != mark) { 517cb93a386Sopenharmony_ci score += item.score; 518cb93a386Sopenharmony_ci item.mark = mark; 519cb93a386Sopenharmony_ci } 520cb93a386Sopenharmony_ci } 521cb93a386Sopenharmony_ci if (score < maxScore) { 522cb93a386Sopenharmony_ci if (numTries < triesLimit && numCandidates < candidatesLimit) { 523cb93a386Sopenharmony_ci next[candidate] = candidates[score]; 524cb93a386Sopenharmony_ci candidates[score] = candidate; 525cb93a386Sopenharmony_ci } else { 526cb93a386Sopenharmony_ci maxScore = rebuildCandidatesList( 527cb93a386Sopenharmony_ci &candidates, &map, span, shortcut, end, next.data()); 528cb93a386Sopenharmony_ci mark = 0; 529cb93a386Sopenharmony_ci numTries = 0; 530cb93a386Sopenharmony_ci numCandidates = 0; 531cb93a386Sopenharmony_ci } 532cb93a386Sopenharmony_ci continue; 533cb93a386Sopenharmony_ci } else if (score > maxScore) { 534cb93a386Sopenharmony_ci fprintf(stderr, "Broken invariant\n"); 535cb93a386Sopenharmony_ci return ""; 536cb93a386Sopenharmony_ci } 537cb93a386Sopenharmony_ci for (TextIdx j = candidate; j < candidate + span; ++j) { 538cb93a386Sopenharmony_ci MetaSlot& item = map[shortcut[j]]; 539cb93a386Sopenharmony_ci item.score = 0; 540cb93a386Sopenharmony_ci } 541cb93a386Sopenharmony_ci addRange(&ranges, candidate, candidate + blockLen); 542cb93a386Sopenharmony_ci break; 543cb93a386Sopenharmony_ci } 544cb93a386Sopenharmony_ci } 545cb93a386Sopenharmony_ci 546cb93a386Sopenharmony_ci return createDictionary(data, ranges, targetSize); 547cb93a386Sopenharmony_ci} 548cb93a386Sopenharmony_ci 549cb93a386Sopenharmony_cistatic std::string durchschlagGenerateCollaborative( 550cb93a386Sopenharmony_ci size_t dictionary_size_limit, size_t block_len, 551cb93a386Sopenharmony_ci const DurchschlagContext& context, const uint8_t* sample_data) { 552cb93a386Sopenharmony_ci /* Parameters aliasing */ 553cb93a386Sopenharmony_ci TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); 554cb93a386Sopenharmony_ci if (targetSize != dictionary_size_limit) { 555cb93a386Sopenharmony_ci fprintf(stderr, "dictionary_size_limit is too large\n"); 556cb93a386Sopenharmony_ci return ""; 557cb93a386Sopenharmony_ci } 558cb93a386Sopenharmony_ci TextIdx sliceLen = context.sliceLen; 559cb93a386Sopenharmony_ci TextIdx total = context.dataSize; 560cb93a386Sopenharmony_ci TextIdx blockLen = static_cast<TextIdx>(block_len); 561cb93a386Sopenharmony_ci if (blockLen != block_len) { 562cb93a386Sopenharmony_ci fprintf(stderr, "block_len is too large\n"); 563cb93a386Sopenharmony_ci return ""; 564cb93a386Sopenharmony_ci } 565cb93a386Sopenharmony_ci const uint8_t* data = sample_data; 566cb93a386Sopenharmony_ci const std::vector<TextIdx>& offsets = context.offsets; 567cb93a386Sopenharmony_ci std::vector<MetaSlot> map(context.numUniqueSlices); 568cb93a386Sopenharmony_ci const TextIdx* shortcut = context.sliceMap.data(); 569cb93a386Sopenharmony_ci 570cb93a386Sopenharmony_ci /* Initialization */ 571cb93a386Sopenharmony_ci if (blockLen < sliceLen) { 572cb93a386Sopenharmony_ci fprintf(stderr, "sliceLen is larger than block_len\n"); 573cb93a386Sopenharmony_ci return ""; 574cb93a386Sopenharmony_ci } 575cb93a386Sopenharmony_ci if (targetSize < blockLen || total < blockLen) { 576cb93a386Sopenharmony_ci fprintf(stderr, "block_len is too large\n"); 577cb93a386Sopenharmony_ci return ""; 578cb93a386Sopenharmony_ci } 579cb93a386Sopenharmony_ci TextIdx end = total - sliceLen + 1; 580cb93a386Sopenharmony_ci ScoreSlices(offsets, map, shortcut, end); 581cb93a386Sopenharmony_ci TextIdx span = blockLen - sliceLen + 1; 582cb93a386Sopenharmony_ci end = static_cast<TextIdx>(context.sliceMap.size()) - span; 583cb93a386Sopenharmony_ci std::vector<Candidate> candidates; 584cb93a386Sopenharmony_ci candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024); 585cb93a386Sopenharmony_ci Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); 586cb93a386Sopenharmony_ci 587cb93a386Sopenharmony_ci /* Block selection */ 588cb93a386Sopenharmony_ci std::vector<Range> ranges; 589cb93a386Sopenharmony_ci TextIdx mark = 0; 590cb93a386Sopenharmony_ci while (true) { 591cb93a386Sopenharmony_ci TextIdx dictSize = calculateDictionarySize(ranges); 592cb93a386Sopenharmony_ci if (dictSize > targetSize - blockLen) { 593cb93a386Sopenharmony_ci break; 594cb93a386Sopenharmony_ci } 595cb93a386Sopenharmony_ci if (minScore == 0 && candidates.empty()) { 596cb93a386Sopenharmony_ci break; 597cb93a386Sopenharmony_ci } 598cb93a386Sopenharmony_ci while (true) { 599cb93a386Sopenharmony_ci if (candidates.empty()) { 600cb93a386Sopenharmony_ci minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); 601cb93a386Sopenharmony_ci mark = 0; 602cb93a386Sopenharmony_ci } 603cb93a386Sopenharmony_ci TextIdx candidate = candidates[0].position; 604cb93a386Sopenharmony_ci Score expectedScore = candidates[0].score; 605cb93a386Sopenharmony_ci if (expectedScore == 0) { 606cb93a386Sopenharmony_ci candidates.resize(0); 607cb93a386Sopenharmony_ci break; 608cb93a386Sopenharmony_ci } 609cb93a386Sopenharmony_ci std::pop_heap(candidates.begin(), candidates.end(), lessScore()); 610cb93a386Sopenharmony_ci candidates.pop_back(); 611cb93a386Sopenharmony_ci mark++; 612cb93a386Sopenharmony_ci Score score = 0; 613cb93a386Sopenharmony_ci for (TextIdx j = candidate; j < candidate + span; ++j) { 614cb93a386Sopenharmony_ci MetaSlot& item = map[shortcut[j]]; 615cb93a386Sopenharmony_ci if (item.mark != mark) { 616cb93a386Sopenharmony_ci score += item.score; 617cb93a386Sopenharmony_ci item.mark = mark; 618cb93a386Sopenharmony_ci } 619cb93a386Sopenharmony_ci } 620cb93a386Sopenharmony_ci if (score < expectedScore) { 621cb93a386Sopenharmony_ci if (score >= minScore) { 622cb93a386Sopenharmony_ci candidates.push_back({score, candidate}); 623cb93a386Sopenharmony_ci std::push_heap(candidates.begin(), candidates.end(), lessScore()); 624cb93a386Sopenharmony_ci } 625cb93a386Sopenharmony_ci continue; 626cb93a386Sopenharmony_ci } else if (score > expectedScore) { 627cb93a386Sopenharmony_ci fatal("Broken invariant"); 628cb93a386Sopenharmony_ci } 629cb93a386Sopenharmony_ci for (TextIdx j = candidate; j < candidate + span; ++j) { 630cb93a386Sopenharmony_ci MetaSlot& item = map[shortcut[j]]; 631cb93a386Sopenharmony_ci item.score = 0; 632cb93a386Sopenharmony_ci } 633cb93a386Sopenharmony_ci addRange(&ranges, candidate, candidate + blockLen); 634cb93a386Sopenharmony_ci break; 635cb93a386Sopenharmony_ci } 636cb93a386Sopenharmony_ci } 637cb93a386Sopenharmony_ci 638cb93a386Sopenharmony_ci return createDictionary(data, ranges, targetSize); 639cb93a386Sopenharmony_ci} 640cb93a386Sopenharmony_ci 641cb93a386Sopenharmony_cistd::string durchschlag_generate(DurchschalgResourceStrategy strategy, 642cb93a386Sopenharmony_ci size_t dictionary_size_limit, size_t block_len, 643cb93a386Sopenharmony_ci const DurchschlagContext& context, const uint8_t* sample_data) { 644cb93a386Sopenharmony_ci if (strategy == DURCHSCHLAG_COLLABORATIVE) { 645cb93a386Sopenharmony_ci return durchschlagGenerateCollaborative( 646cb93a386Sopenharmony_ci dictionary_size_limit, block_len, context, sample_data); 647cb93a386Sopenharmony_ci } else { 648cb93a386Sopenharmony_ci return durchschlagGenerateExclusive( 649cb93a386Sopenharmony_ci dictionary_size_limit, block_len, context, sample_data); 650cb93a386Sopenharmony_ci } 651cb93a386Sopenharmony_ci} 652cb93a386Sopenharmony_ci 653cb93a386Sopenharmony_civoid durchschlag_distill(size_t slice_len, size_t minimum_population, 654cb93a386Sopenharmony_ci std::vector<size_t>* sample_sizes, uint8_t* sample_data) { 655cb93a386Sopenharmony_ci /* Parameters aliasing */ 656cb93a386Sopenharmony_ci uint8_t* data = sample_data; 657cb93a386Sopenharmony_ci 658cb93a386Sopenharmony_ci /* Build slice map. */ 659cb93a386Sopenharmony_ci DurchschlagContext context = durchschlag_prepare( 660cb93a386Sopenharmony_ci slice_len, *sample_sizes, data); 661cb93a386Sopenharmony_ci 662cb93a386Sopenharmony_ci /* Calculate slice population. */ 663cb93a386Sopenharmony_ci const std::vector<TextIdx>& offsets = context.offsets; 664cb93a386Sopenharmony_ci std::vector<MetaSlot> map(context.numUniqueSlices); 665cb93a386Sopenharmony_ci const TextIdx* shortcut = context.sliceMap.data(); 666cb93a386Sopenharmony_ci TextIdx sliceLen = context.sliceLen; 667cb93a386Sopenharmony_ci TextIdx total = context.dataSize; 668cb93a386Sopenharmony_ci TextIdx end = total - sliceLen + 1; 669cb93a386Sopenharmony_ci ScoreSlices(offsets, map, shortcut, end); 670cb93a386Sopenharmony_ci 671cb93a386Sopenharmony_ci /* Condense samples, omitting unique slices. */ 672cb93a386Sopenharmony_ci TextIdx readPos = 0; 673cb93a386Sopenharmony_ci TextIdx writePos = 0; 674cb93a386Sopenharmony_ci TextIdx lastNonUniquePos = 0; 675cb93a386Sopenharmony_ci for (TextIdx i = 0; i < sample_sizes->size(); ++i) { 676cb93a386Sopenharmony_ci TextIdx sampleStart = writePos; 677cb93a386Sopenharmony_ci TextIdx oldSampleEnd = 678cb93a386Sopenharmony_ci readPos + static_cast<TextIdx>(sample_sizes->at(i)); 679cb93a386Sopenharmony_ci while (readPos < oldSampleEnd) { 680cb93a386Sopenharmony_ci if (readPos < end) { 681cb93a386Sopenharmony_ci MetaSlot& item = map[shortcut[readPos]]; 682cb93a386Sopenharmony_ci if (item.score >= minimum_population) { 683cb93a386Sopenharmony_ci lastNonUniquePos = readPos + sliceLen; 684cb93a386Sopenharmony_ci } 685cb93a386Sopenharmony_ci } 686cb93a386Sopenharmony_ci if (readPos < lastNonUniquePos) { 687cb93a386Sopenharmony_ci data[writePos++] = data[readPos]; 688cb93a386Sopenharmony_ci } 689cb93a386Sopenharmony_ci readPos++; 690cb93a386Sopenharmony_ci } 691cb93a386Sopenharmony_ci sample_sizes->at(i) = writePos - sampleStart; 692cb93a386Sopenharmony_ci } 693cb93a386Sopenharmony_ci} 694cb93a386Sopenharmony_ci 695cb93a386Sopenharmony_civoid durchschlag_purify(size_t slice_len, size_t minimum_population, 696cb93a386Sopenharmony_ci const std::vector<size_t>& sample_sizes, uint8_t* sample_data) { 697cb93a386Sopenharmony_ci /* Parameters aliasing */ 698cb93a386Sopenharmony_ci uint8_t* data = sample_data; 699cb93a386Sopenharmony_ci 700cb93a386Sopenharmony_ci /* Build slice map. */ 701cb93a386Sopenharmony_ci DurchschlagContext context = durchschlag_prepare( 702cb93a386Sopenharmony_ci slice_len, sample_sizes, data); 703cb93a386Sopenharmony_ci 704cb93a386Sopenharmony_ci /* Calculate slice population. */ 705cb93a386Sopenharmony_ci const std::vector<TextIdx>& offsets = context.offsets; 706cb93a386Sopenharmony_ci std::vector<MetaSlot> map(context.numUniqueSlices); 707cb93a386Sopenharmony_ci const TextIdx* shortcut = context.sliceMap.data(); 708cb93a386Sopenharmony_ci TextIdx sliceLen = context.sliceLen; 709cb93a386Sopenharmony_ci TextIdx total = context.dataSize; 710cb93a386Sopenharmony_ci TextIdx end = total - sliceLen + 1; 711cb93a386Sopenharmony_ci ScoreSlices(offsets, map, shortcut, end); 712cb93a386Sopenharmony_ci 713cb93a386Sopenharmony_ci /* Rewrite samples, zeroing out unique slices. */ 714cb93a386Sopenharmony_ci TextIdx lastNonUniquePos = 0; 715cb93a386Sopenharmony_ci for (TextIdx readPos = 0; readPos < total; ++readPos) { 716cb93a386Sopenharmony_ci if (readPos < end) { 717cb93a386Sopenharmony_ci MetaSlot& item = map[shortcut[readPos]]; 718cb93a386Sopenharmony_ci if (item.score >= minimum_population) { 719cb93a386Sopenharmony_ci lastNonUniquePos = readPos + sliceLen; 720cb93a386Sopenharmony_ci } 721cb93a386Sopenharmony_ci } 722cb93a386Sopenharmony_ci if (readPos >= lastNonUniquePos) { 723cb93a386Sopenharmony_ci data[readPos] = 0; 724cb93a386Sopenharmony_ci } 725cb93a386Sopenharmony_ci } 726cb93a386Sopenharmony_ci} 727