1cb93a386Sopenharmony_ci#include "./durchschlag.h"
2cb93a386Sopenharmony_ci
3cb93a386Sopenharmony_ci#include <algorithm>
4cb93a386Sopenharmony_ci#include <exception>  /* terminate */
5cb93a386Sopenharmony_ci
6cb93a386Sopenharmony_ci#include "divsufsort.h"
7cb93a386Sopenharmony_ci
8cb93a386Sopenharmony_ci/* Pointer to position in text. */
9cb93a386Sopenharmony_citypedef DurchschlagTextIdx TextIdx;
10cb93a386Sopenharmony_ci
11cb93a386Sopenharmony_ci/* (Sum of) value(s) of slice(s). */
12cb93a386Sopenharmony_citypedef uint32_t Score;
13cb93a386Sopenharmony_ci
14cb93a386Sopenharmony_citypedef struct HashSlot {
15cb93a386Sopenharmony_ci  TextIdx next;
16cb93a386Sopenharmony_ci  TextIdx offset;
17cb93a386Sopenharmony_ci} HashSlot;
18cb93a386Sopenharmony_ci
19cb93a386Sopenharmony_citypedef struct MetaSlot {
20cb93a386Sopenharmony_ci  TextIdx mark;
21cb93a386Sopenharmony_ci  Score score;
22cb93a386Sopenharmony_ci} MetaSlot;
23cb93a386Sopenharmony_ci
24cb93a386Sopenharmony_citypedef struct Range {
25cb93a386Sopenharmony_ci  TextIdx start;
26cb93a386Sopenharmony_ci  TextIdx end;
27cb93a386Sopenharmony_ci} Range;
28cb93a386Sopenharmony_ci
29cb93a386Sopenharmony_citypedef struct Candidate {
30cb93a386Sopenharmony_ci  Score score;
31cb93a386Sopenharmony_ci  TextIdx position;
32cb93a386Sopenharmony_ci} Candidate;
33cb93a386Sopenharmony_ci
34cb93a386Sopenharmony_cistruct greaterScore {
35cb93a386Sopenharmony_ci  bool operator()(const Candidate& a, const Candidate& b) const {
36cb93a386Sopenharmony_ci    return (a.score > b.score) ||
37cb93a386Sopenharmony_ci        ((a.score == b.score) && (a.position < b.position));
38cb93a386Sopenharmony_ci  }
39cb93a386Sopenharmony_ci};
40cb93a386Sopenharmony_ci
41cb93a386Sopenharmony_cistruct lessScore {
42cb93a386Sopenharmony_ci  bool operator()(const Candidate& a, const Candidate& b) const {
43cb93a386Sopenharmony_ci    return (a.score < b.score) ||
44cb93a386Sopenharmony_ci        ((a.score == b.score) && (a.position > b.position));
45cb93a386Sopenharmony_ci  }
46cb93a386Sopenharmony_ci};
47cb93a386Sopenharmony_ci
48cb93a386Sopenharmony_ci#define CANDIDATE_BUNDLE_SIZE (1 << 18)
49cb93a386Sopenharmony_ci
50cb93a386Sopenharmony_cistatic void fatal(const char* error) {
51cb93a386Sopenharmony_ci  fprintf(stderr, "%s\n", error);
52cb93a386Sopenharmony_ci  std::terminate();
53cb93a386Sopenharmony_ci}
54cb93a386Sopenharmony_ci
55cb93a386Sopenharmony_cistatic TextIdx calculateDictionarySize(const std::vector<Range>& ranges) {
56cb93a386Sopenharmony_ci  TextIdx result = 0;
57cb93a386Sopenharmony_ci  for (size_t i = 0; i < ranges.size(); ++i) {
58cb93a386Sopenharmony_ci    const Range& r = ranges[i];
59cb93a386Sopenharmony_ci    result += r.end - r.start;
60cb93a386Sopenharmony_ci  }
61cb93a386Sopenharmony_ci  return result;
62cb93a386Sopenharmony_ci}
63cb93a386Sopenharmony_ci
64cb93a386Sopenharmony_cistatic std::string createDictionary(
65cb93a386Sopenharmony_ci    const uint8_t* data, const std::vector<Range>& ranges, size_t limit) {
66cb93a386Sopenharmony_ci  std::string output;
67cb93a386Sopenharmony_ci  output.reserve(calculateDictionarySize(ranges));
68cb93a386Sopenharmony_ci  for (size_t i = 0; i < ranges.size(); ++i) {
69cb93a386Sopenharmony_ci    const Range& r = ranges[i];
70cb93a386Sopenharmony_ci    output.insert(output.end(), &data[r.start], &data[r.end]);
71cb93a386Sopenharmony_ci  }
72cb93a386Sopenharmony_ci  if (output.size() > limit) {
73cb93a386Sopenharmony_ci    output.resize(limit);
74cb93a386Sopenharmony_ci  }
75cb93a386Sopenharmony_ci  return output;
76cb93a386Sopenharmony_ci}
77cb93a386Sopenharmony_ci
78cb93a386Sopenharmony_ci/* precondition: span > 0
79cb93a386Sopenharmony_ci   precondition: end + span == len(shortcut) */
80cb93a386Sopenharmony_cistatic Score buildCandidatesList(std::vector<Candidate>* candidates,
81cb93a386Sopenharmony_ci    std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
82cb93a386Sopenharmony_ci    TextIdx end) {
83cb93a386Sopenharmony_ci  candidates->resize(0);
84cb93a386Sopenharmony_ci
85cb93a386Sopenharmony_ci  size_t n = map->size();
86cb93a386Sopenharmony_ci  MetaSlot* slots = map->data();
87cb93a386Sopenharmony_ci  for (size_t j = 0; j < n; ++j) {
88cb93a386Sopenharmony_ci    slots[j].mark = 0;
89cb93a386Sopenharmony_ci  }
90cb93a386Sopenharmony_ci
91cb93a386Sopenharmony_ci  Score score = 0;
92cb93a386Sopenharmony_ci  /* Consider the whole span, except one last item. The following loop will
93cb93a386Sopenharmony_ci     add the last item to the end of the "chain", evaluate it, and cut one
94cb93a386Sopenharmony_ci     "link" form the beginning. */
95cb93a386Sopenharmony_ci  for (size_t j = 0; j < span - 1; ++j) {
96cb93a386Sopenharmony_ci    MetaSlot& item = slots[shortcut[j]];
97cb93a386Sopenharmony_ci    if (item.mark == 0) {
98cb93a386Sopenharmony_ci      score += item.score;
99cb93a386Sopenharmony_ci    }
100cb93a386Sopenharmony_ci    item.mark++;
101cb93a386Sopenharmony_ci  }
102cb93a386Sopenharmony_ci
103cb93a386Sopenharmony_ci  TextIdx i = 0;
104cb93a386Sopenharmony_ci  TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE);
105cb93a386Sopenharmony_ci  Score maxScore = 0;
106cb93a386Sopenharmony_ci  for (; i < limit; ++i) {
107cb93a386Sopenharmony_ci    TextIdx slice = shortcut[i + span - 1];
108cb93a386Sopenharmony_ci    MetaSlot& pick = slots[slice];
109cb93a386Sopenharmony_ci    if (pick.mark == 0) {
110cb93a386Sopenharmony_ci      score += pick.score;
111cb93a386Sopenharmony_ci    }
112cb93a386Sopenharmony_ci    pick.mark++;
113cb93a386Sopenharmony_ci
114cb93a386Sopenharmony_ci    if (score > maxScore) {
115cb93a386Sopenharmony_ci      maxScore = score;
116cb93a386Sopenharmony_ci    }
117cb93a386Sopenharmony_ci    candidates->push_back({score, i});
118cb93a386Sopenharmony_ci
119cb93a386Sopenharmony_ci    MetaSlot& drop = slots[shortcut[i]];
120cb93a386Sopenharmony_ci    drop.mark--;
121cb93a386Sopenharmony_ci    if (drop.mark == 0) {
122cb93a386Sopenharmony_ci      score -= drop.score;
123cb93a386Sopenharmony_ci    }
124cb93a386Sopenharmony_ci  }
125cb93a386Sopenharmony_ci
126cb93a386Sopenharmony_ci  std::make_heap(candidates->begin(), candidates->end(), greaterScore());
127cb93a386Sopenharmony_ci  Score minScore = candidates->at(0).score;
128cb93a386Sopenharmony_ci  for (; i < end; ++i) {
129cb93a386Sopenharmony_ci    TextIdx slice = shortcut[i + span - 1];
130cb93a386Sopenharmony_ci    MetaSlot& pick = slots[slice];
131cb93a386Sopenharmony_ci    if (pick.mark == 0) {
132cb93a386Sopenharmony_ci      score += pick.score;
133cb93a386Sopenharmony_ci    }
134cb93a386Sopenharmony_ci    pick.mark++;
135cb93a386Sopenharmony_ci
136cb93a386Sopenharmony_ci    if (score > maxScore) {
137cb93a386Sopenharmony_ci      maxScore = score;
138cb93a386Sopenharmony_ci    }
139cb93a386Sopenharmony_ci    if (score >= minScore) {
140cb93a386Sopenharmony_ci      candidates->push_back({score, i});
141cb93a386Sopenharmony_ci      std::push_heap(candidates->begin(), candidates->end(), greaterScore());
142cb93a386Sopenharmony_ci      if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) {
143cb93a386Sopenharmony_ci        while (candidates->at(0).score == minScore) {
144cb93a386Sopenharmony_ci          std::pop_heap(candidates->begin(), candidates->end(), greaterScore());
145cb93a386Sopenharmony_ci          candidates->pop_back();
146cb93a386Sopenharmony_ci        }
147cb93a386Sopenharmony_ci        minScore = candidates->at(0).score;
148cb93a386Sopenharmony_ci      }
149cb93a386Sopenharmony_ci    }
150cb93a386Sopenharmony_ci
151cb93a386Sopenharmony_ci    MetaSlot& drop = slots[shortcut[i]];
152cb93a386Sopenharmony_ci    drop.mark--;
153cb93a386Sopenharmony_ci    if (drop.mark == 0) {
154cb93a386Sopenharmony_ci      score -= drop.score;
155cb93a386Sopenharmony_ci    }
156cb93a386Sopenharmony_ci  }
157cb93a386Sopenharmony_ci
158cb93a386Sopenharmony_ci  for (size_t j = 0; j < n; ++j) {
159cb93a386Sopenharmony_ci    slots[j].mark = 0;
160cb93a386Sopenharmony_ci  }
161cb93a386Sopenharmony_ci
162cb93a386Sopenharmony_ci  std::make_heap(candidates->begin(), candidates->end(), lessScore());
163cb93a386Sopenharmony_ci  return minScore;
164cb93a386Sopenharmony_ci}
165cb93a386Sopenharmony_ci
166cb93a386Sopenharmony_ci/* precondition: span > 0
167cb93a386Sopenharmony_ci   precondition: end + span == len(shortcut) */
168cb93a386Sopenharmony_cistatic Score rebuildCandidatesList(std::vector<TextIdx>* candidates,
169cb93a386Sopenharmony_ci    std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
170cb93a386Sopenharmony_ci    TextIdx end, TextIdx* next) {
171cb93a386Sopenharmony_ci  size_t n = candidates->size();
172cb93a386Sopenharmony_ci  TextIdx* data = candidates->data();
173cb93a386Sopenharmony_ci  for (size_t i = 0; i < n; ++i) {
174cb93a386Sopenharmony_ci    data[i] = 0;
175cb93a386Sopenharmony_ci  }
176cb93a386Sopenharmony_ci
177cb93a386Sopenharmony_ci  n = map->size();
178cb93a386Sopenharmony_ci  MetaSlot* slots = map->data();
179cb93a386Sopenharmony_ci  for (size_t i = 0; i < n; ++i) {
180cb93a386Sopenharmony_ci    slots[i].mark = 0;
181cb93a386Sopenharmony_ci  }
182cb93a386Sopenharmony_ci
183cb93a386Sopenharmony_ci  Score score = 0;
184cb93a386Sopenharmony_ci  /* Consider the whole span, except one last item. The following loop will
185cb93a386Sopenharmony_ci     add the last item to the end of the "chain", evaluate it, and cut one
186cb93a386Sopenharmony_ci     "link" form the beginning. */
187cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < span - 1; ++i) {
188cb93a386Sopenharmony_ci    MetaSlot& item = slots[shortcut[i]];
189cb93a386Sopenharmony_ci    if (item.mark == 0) {
190cb93a386Sopenharmony_ci      score += item.score;
191cb93a386Sopenharmony_ci    }
192cb93a386Sopenharmony_ci    item.mark++;
193cb93a386Sopenharmony_ci  }
194cb93a386Sopenharmony_ci
195cb93a386Sopenharmony_ci  Score maxScore = 0;
196cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < end; ++i) {
197cb93a386Sopenharmony_ci    MetaSlot& pick = slots[shortcut[i + span - 1]];
198cb93a386Sopenharmony_ci    if (pick.mark == 0) {
199cb93a386Sopenharmony_ci      score += pick.score;
200cb93a386Sopenharmony_ci    }
201cb93a386Sopenharmony_ci    pick.mark++;
202cb93a386Sopenharmony_ci
203cb93a386Sopenharmony_ci    if (candidates->size() <= score) {
204cb93a386Sopenharmony_ci      candidates->resize(score + 1);
205cb93a386Sopenharmony_ci    }
206cb93a386Sopenharmony_ci    if (score > maxScore) {
207cb93a386Sopenharmony_ci      maxScore = score;
208cb93a386Sopenharmony_ci    }
209cb93a386Sopenharmony_ci    next[i] = candidates->at(score);
210cb93a386Sopenharmony_ci    candidates->at(score) = i;
211cb93a386Sopenharmony_ci
212cb93a386Sopenharmony_ci    MetaSlot& drop = slots[shortcut[i]];
213cb93a386Sopenharmony_ci    drop.mark--;
214cb93a386Sopenharmony_ci    if (drop.mark == 0) {
215cb93a386Sopenharmony_ci      score -= drop.score;
216cb93a386Sopenharmony_ci    }
217cb93a386Sopenharmony_ci  }
218cb93a386Sopenharmony_ci
219cb93a386Sopenharmony_ci  for (size_t i = 0; i < n; ++i) {
220cb93a386Sopenharmony_ci    slots[i].mark = 0;
221cb93a386Sopenharmony_ci  }
222cb93a386Sopenharmony_ci
223cb93a386Sopenharmony_ci  candidates->resize(maxScore + 1);
224cb93a386Sopenharmony_ci  return maxScore;
225cb93a386Sopenharmony_ci}
226cb93a386Sopenharmony_ci
227cb93a386Sopenharmony_cistatic void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) {
228cb93a386Sopenharmony_ci  for (auto it = ranges->begin(); it != ranges->end();) {
229cb93a386Sopenharmony_ci    if (end < it->start) {
230cb93a386Sopenharmony_ci      ranges->insert(it, {start, end});
231cb93a386Sopenharmony_ci      return;
232cb93a386Sopenharmony_ci    }
233cb93a386Sopenharmony_ci    if (it->end < start) {
234cb93a386Sopenharmony_ci      it++;
235cb93a386Sopenharmony_ci      continue;
236cb93a386Sopenharmony_ci    }
237cb93a386Sopenharmony_ci    // Combine with existing.
238cb93a386Sopenharmony_ci    start = std::min(start, it->start);
239cb93a386Sopenharmony_ci    end = std::max(end, it->end);
240cb93a386Sopenharmony_ci    // Remove consumed vector and continue.
241cb93a386Sopenharmony_ci    it = ranges->erase(it);
242cb93a386Sopenharmony_ci  }
243cb93a386Sopenharmony_ci  ranges->push_back({start, end});
244cb93a386Sopenharmony_ci}
245cb93a386Sopenharmony_ci
246cb93a386Sopenharmony_cistd::string durchschlag_generate(
247cb93a386Sopenharmony_ci    size_t dictionary_size_limit, size_t slice_len, size_t block_len,
248cb93a386Sopenharmony_ci    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
249cb93a386Sopenharmony_ci  DurchschlagContext ctx = durchschlag_prepare(
250cb93a386Sopenharmony_ci      slice_len, sample_sizes, sample_data);
251cb93a386Sopenharmony_ci  return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE,
252cb93a386Sopenharmony_ci      dictionary_size_limit, block_len, ctx, sample_data);
253cb93a386Sopenharmony_ci}
254cb93a386Sopenharmony_ci
255cb93a386Sopenharmony_ciDurchschlagContext durchschlag_prepare(size_t slice_len,
256cb93a386Sopenharmony_ci    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
257cb93a386Sopenharmony_ci  /* Parameters aliasing */
258cb93a386Sopenharmony_ci  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
259cb93a386Sopenharmony_ci  if (sliceLen != slice_len) fatal("slice_len is too large");
260cb93a386Sopenharmony_ci  if (sliceLen < 1) fatal("slice_len is too small");
261cb93a386Sopenharmony_ci  const uint8_t* data = sample_data;
262cb93a386Sopenharmony_ci
263cb93a386Sopenharmony_ci  TextIdx total = 0;
264cb93a386Sopenharmony_ci  std::vector<TextIdx> offsets;
265cb93a386Sopenharmony_ci  offsets.reserve(sample_sizes.size());
266cb93a386Sopenharmony_ci  for (size_t i = 0; i < sample_sizes.size(); ++i) {
267cb93a386Sopenharmony_ci    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
268cb93a386Sopenharmony_ci    if (delta != sample_sizes[i]) fatal("sample is too large");
269cb93a386Sopenharmony_ci    if (delta == 0) fatal("0-length samples are prohibited");
270cb93a386Sopenharmony_ci    TextIdx next_total = total + delta;
271cb93a386Sopenharmony_ci    if (next_total <= total) fatal("corpus is too large");
272cb93a386Sopenharmony_ci    total = next_total;
273cb93a386Sopenharmony_ci    offsets.push_back(total);
274cb93a386Sopenharmony_ci  }
275cb93a386Sopenharmony_ci
276cb93a386Sopenharmony_ci  if (total < sliceLen) fatal("slice_len is larger than corpus size");
277cb93a386Sopenharmony_ci  TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1;
278cb93a386Sopenharmony_ci  TextIdx hashLen = 11;
279cb93a386Sopenharmony_ci  while (hashLen < 29 && ((1u << hashLen) < end)) {
280cb93a386Sopenharmony_ci    hashLen += 3;
281cb93a386Sopenharmony_ci  }
282cb93a386Sopenharmony_ci  hashLen -= 3;
283cb93a386Sopenharmony_ci  TextIdx hashMask = (1u << hashLen) - 1u;
284cb93a386Sopenharmony_ci  std::vector<TextIdx> hashHead(1 << hashLen);
285cb93a386Sopenharmony_ci  TextIdx hash = 0;
286cb93a386Sopenharmony_ci  TextIdx lShift = 3;
287cb93a386Sopenharmony_ci  TextIdx rShift = hashLen - lShift;
288cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < sliceLen - 1; ++i) {
289cb93a386Sopenharmony_ci    TextIdx v = data[i];
290cb93a386Sopenharmony_ci    hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
291cb93a386Sopenharmony_ci  }
292cb93a386Sopenharmony_ci  TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
293cb93a386Sopenharmony_ci  TextIdx rShiftX = hashLen - lShiftX;
294cb93a386Sopenharmony_ci
295cb93a386Sopenharmony_ci  std::vector<HashSlot> map;
296cb93a386Sopenharmony_ci  map.push_back({0, 0});
297cb93a386Sopenharmony_ci  TextIdx hashSlot = 1;
298cb93a386Sopenharmony_ci  std::vector<TextIdx> sliceMap;
299cb93a386Sopenharmony_ci  sliceMap.reserve(end);
300cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < end; ++i) {
301cb93a386Sopenharmony_ci    TextIdx v = data[i + sliceLen - 1];
302cb93a386Sopenharmony_ci    TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
303cb93a386Sopenharmony_ci    v = data[i];
304cb93a386Sopenharmony_ci    hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask);
305cb93a386Sopenharmony_ci    TextIdx slot = hashHead[bucket];
306cb93a386Sopenharmony_ci    while (slot != 0) {
307cb93a386Sopenharmony_ci      HashSlot& item = map[slot];
308cb93a386Sopenharmony_ci      TextIdx start = item.offset;
309cb93a386Sopenharmony_ci      bool miss = false;
310cb93a386Sopenharmony_ci      for (TextIdx j = 0; j < sliceLen; ++j) {
311cb93a386Sopenharmony_ci        if (data[i + j] != data[start + j]) {
312cb93a386Sopenharmony_ci          miss = true;
313cb93a386Sopenharmony_ci          break;
314cb93a386Sopenharmony_ci        }
315cb93a386Sopenharmony_ci      }
316cb93a386Sopenharmony_ci      if (!miss) {
317cb93a386Sopenharmony_ci        sliceMap.push_back(slot);
318cb93a386Sopenharmony_ci        break;
319cb93a386Sopenharmony_ci      }
320cb93a386Sopenharmony_ci      slot = item.next;
321cb93a386Sopenharmony_ci    }
322cb93a386Sopenharmony_ci    if (slot == 0) {
323cb93a386Sopenharmony_ci      map.push_back({hashHead[bucket], i});
324cb93a386Sopenharmony_ci      hashHead[bucket] = hashSlot;
325cb93a386Sopenharmony_ci      sliceMap.push_back(hashSlot);
326cb93a386Sopenharmony_ci      hashSlot++;
327cb93a386Sopenharmony_ci    }
328cb93a386Sopenharmony_ci  }
329cb93a386Sopenharmony_ci
330cb93a386Sopenharmony_ci  return {total, sliceLen, static_cast<TextIdx>(map.size()),
331cb93a386Sopenharmony_ci      std::move(offsets), std::move(sliceMap)};
332cb93a386Sopenharmony_ci}
333cb93a386Sopenharmony_ci
334cb93a386Sopenharmony_ciDurchschlagContext durchschlag_prepare(size_t slice_len,
335cb93a386Sopenharmony_ci    const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) {
336cb93a386Sopenharmony_ci  /* Parameters aliasing */
337cb93a386Sopenharmony_ci  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
338cb93a386Sopenharmony_ci  if (sliceLen != slice_len) fatal("slice_len is too large");
339cb93a386Sopenharmony_ci  if (sliceLen < 1) fatal("slice_len is too small");
340cb93a386Sopenharmony_ci  const TextIdx* lcp = index.lcp.data();
341cb93a386Sopenharmony_ci  const TextIdx* sa = index.sa.data();
342cb93a386Sopenharmony_ci
343cb93a386Sopenharmony_ci  TextIdx total = 0;
344cb93a386Sopenharmony_ci  std::vector<TextIdx> offsets;
345cb93a386Sopenharmony_ci  offsets.reserve(sample_sizes.size());
346cb93a386Sopenharmony_ci  for (size_t i = 0; i < sample_sizes.size(); ++i) {
347cb93a386Sopenharmony_ci    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
348cb93a386Sopenharmony_ci    if (delta != sample_sizes[i]) fatal("sample is too large");
349cb93a386Sopenharmony_ci    if (delta == 0) fatal("0-length samples are prohibited");
350cb93a386Sopenharmony_ci    TextIdx next_total = total + delta;
351cb93a386Sopenharmony_ci    if (next_total <= total) fatal("corpus is too large");
352cb93a386Sopenharmony_ci    total = next_total;
353cb93a386Sopenharmony_ci    offsets.push_back(total);
354cb93a386Sopenharmony_ci  }
355cb93a386Sopenharmony_ci
356cb93a386Sopenharmony_ci  if (total < sliceLen) fatal("slice_len is larger than corpus size");
357cb93a386Sopenharmony_ci  TextIdx counter = 1;
358cb93a386Sopenharmony_ci  TextIdx end = total - sliceLen + 1;
359cb93a386Sopenharmony_ci  std::vector<TextIdx> sliceMap(total);
360cb93a386Sopenharmony_ci  TextIdx last = 0;
361cb93a386Sopenharmony_ci  TextIdx current = 1;
362cb93a386Sopenharmony_ci  while (current <= total) {
363cb93a386Sopenharmony_ci    if (lcp[current - 1] < sliceLen) {
364cb93a386Sopenharmony_ci      for (TextIdx i = last; i < current; ++i) {
365cb93a386Sopenharmony_ci        sliceMap[sa[i]] = counter;
366cb93a386Sopenharmony_ci      }
367cb93a386Sopenharmony_ci      counter++;
368cb93a386Sopenharmony_ci      last = current;
369cb93a386Sopenharmony_ci    }
370cb93a386Sopenharmony_ci    current++;
371cb93a386Sopenharmony_ci  }
372cb93a386Sopenharmony_ci  sliceMap.resize(end);
373cb93a386Sopenharmony_ci
374cb93a386Sopenharmony_ci  // Reorder items for the better locality.
375cb93a386Sopenharmony_ci  std::vector<TextIdx> reorder(counter);
376cb93a386Sopenharmony_ci  counter = 1;
377cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < end; ++i) {
378cb93a386Sopenharmony_ci    if (reorder[sliceMap[i]] == 0) {
379cb93a386Sopenharmony_ci      reorder[sliceMap[i]] = counter++;
380cb93a386Sopenharmony_ci    }
381cb93a386Sopenharmony_ci  }
382cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < end; ++i) {
383cb93a386Sopenharmony_ci    sliceMap[i] = reorder[sliceMap[i]];
384cb93a386Sopenharmony_ci  }
385cb93a386Sopenharmony_ci
386cb93a386Sopenharmony_ci  return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)};
387cb93a386Sopenharmony_ci}
388cb93a386Sopenharmony_ci
389cb93a386Sopenharmony_ciDurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) {
390cb93a386Sopenharmony_ci  TextIdx total = static_cast<TextIdx>(data.size());
391cb93a386Sopenharmony_ci  if (total != data.size()) fatal("corpus is too large");
392cb93a386Sopenharmony_ci  saidx_t saTotal = static_cast<saidx_t>(total);
393cb93a386Sopenharmony_ci  if (saTotal < 0) fatal("corpus is too large");
394cb93a386Sopenharmony_ci  if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large");
395cb93a386Sopenharmony_ci  std::vector<TextIdx> sa(total);
396cb93a386Sopenharmony_ci  /* Hopefully, non-negative int32_t values match TextIdx ones. */
397cb93a386Sopenharmony_ci  if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch");
398cb93a386Sopenharmony_ci  int32_t* saData = reinterpret_cast<int32_t*>(sa.data());
399cb93a386Sopenharmony_ci  divsufsort(data.data(), saData, saTotal);
400cb93a386Sopenharmony_ci
401cb93a386Sopenharmony_ci  std::vector<TextIdx> isa(total);
402cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i;
403cb93a386Sopenharmony_ci
404cb93a386Sopenharmony_ci  // TODO: borrowed -> unknown efficiency.
405cb93a386Sopenharmony_ci  std::vector<TextIdx> lcp(total);
406cb93a386Sopenharmony_ci  TextIdx k = 0;
407cb93a386Sopenharmony_ci  lcp[total - 1] = 0;
408cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < total; ++i) {
409cb93a386Sopenharmony_ci    TextIdx current = isa[i];
410cb93a386Sopenharmony_ci    if (current == total - 1) {
411cb93a386Sopenharmony_ci      k = 0;
412cb93a386Sopenharmony_ci      continue;
413cb93a386Sopenharmony_ci    }
414cb93a386Sopenharmony_ci    TextIdx j = sa[current + 1];  // Suffix which follow i-th suffix.
415cb93a386Sopenharmony_ci    while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) {
416cb93a386Sopenharmony_ci      ++k;
417cb93a386Sopenharmony_ci    }
418cb93a386Sopenharmony_ci    lcp[current] = k;
419cb93a386Sopenharmony_ci    if (k > 0) --k;
420cb93a386Sopenharmony_ci  }
421cb93a386Sopenharmony_ci
422cb93a386Sopenharmony_ci  return {std::move(lcp), std::move(sa)};
423cb93a386Sopenharmony_ci}
424cb93a386Sopenharmony_ci
425cb93a386Sopenharmony_cistatic void ScoreSlices(const std::vector<TextIdx>& offsets,
426cb93a386Sopenharmony_ci    std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) {
427cb93a386Sopenharmony_ci  TextIdx piece = 0;
428cb93a386Sopenharmony_ci  /* Fresh map contains all zeroes -> initial mark should be different. */
429cb93a386Sopenharmony_ci  TextIdx mark = 1;
430cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < end; ++i) {
431cb93a386Sopenharmony_ci    if (offsets[piece] == i) {
432cb93a386Sopenharmony_ci      piece++;
433cb93a386Sopenharmony_ci      mark++;
434cb93a386Sopenharmony_ci    }
435cb93a386Sopenharmony_ci    MetaSlot& item = map[shortcut[i]];
436cb93a386Sopenharmony_ci    if (item.mark != mark) {
437cb93a386Sopenharmony_ci      item.mark = mark;
438cb93a386Sopenharmony_ci      item.score++;
439cb93a386Sopenharmony_ci    }
440cb93a386Sopenharmony_ci  }
441cb93a386Sopenharmony_ci}
442cb93a386Sopenharmony_ci
443cb93a386Sopenharmony_cistatic std::string durchschlagGenerateExclusive(
444cb93a386Sopenharmony_ci    size_t dictionary_size_limit, size_t block_len,
445cb93a386Sopenharmony_ci    const DurchschlagContext& context, const uint8_t* sample_data) {
446cb93a386Sopenharmony_ci  /* Parameters aliasing */
447cb93a386Sopenharmony_ci  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
448cb93a386Sopenharmony_ci  if (targetSize != dictionary_size_limit) {
449cb93a386Sopenharmony_ci    fprintf(stderr, "dictionary_size_limit is too large\n");
450cb93a386Sopenharmony_ci    return "";
451cb93a386Sopenharmony_ci  }
452cb93a386Sopenharmony_ci  TextIdx sliceLen = context.sliceLen;
453cb93a386Sopenharmony_ci  TextIdx total = context.dataSize;
454cb93a386Sopenharmony_ci  TextIdx blockLen = static_cast<TextIdx>(block_len);
455cb93a386Sopenharmony_ci  if (blockLen != block_len) {
456cb93a386Sopenharmony_ci    fprintf(stderr, "block_len is too large\n");
457cb93a386Sopenharmony_ci    return "";
458cb93a386Sopenharmony_ci  }
459cb93a386Sopenharmony_ci  const uint8_t* data = sample_data;
460cb93a386Sopenharmony_ci  const std::vector<TextIdx>& offsets = context.offsets;
461cb93a386Sopenharmony_ci  std::vector<MetaSlot> map(context.numUniqueSlices);
462cb93a386Sopenharmony_ci  const TextIdx* shortcut = context.sliceMap.data();
463cb93a386Sopenharmony_ci
464cb93a386Sopenharmony_ci  /* Initialization */
465cb93a386Sopenharmony_ci  if (blockLen < sliceLen) {
466cb93a386Sopenharmony_ci    fprintf(stderr, "sliceLen is larger than block_len\n");
467cb93a386Sopenharmony_ci    return "";
468cb93a386Sopenharmony_ci  }
469cb93a386Sopenharmony_ci  if (targetSize < blockLen || total < blockLen) {
470cb93a386Sopenharmony_ci    fprintf(stderr, "block_len is too large\n");
471cb93a386Sopenharmony_ci    return "";
472cb93a386Sopenharmony_ci  }
473cb93a386Sopenharmony_ci  TextIdx end = total - sliceLen + 1;
474cb93a386Sopenharmony_ci  ScoreSlices(offsets, map, shortcut, end);
475cb93a386Sopenharmony_ci  TextIdx span = blockLen - sliceLen + 1;
476cb93a386Sopenharmony_ci  end = static_cast<TextIdx>(context.sliceMap.size()) - span;
477cb93a386Sopenharmony_ci  std::vector<TextIdx> candidates;
478cb93a386Sopenharmony_ci  std::vector<TextIdx> next(end);
479cb93a386Sopenharmony_ci  Score maxScore = rebuildCandidatesList(
480cb93a386Sopenharmony_ci      &candidates, &map, span, shortcut, end, next.data());
481cb93a386Sopenharmony_ci
482cb93a386Sopenharmony_ci  /* Block selection */
483cb93a386Sopenharmony_ci  const size_t triesLimit = (600 * 1000000) / span;
484cb93a386Sopenharmony_ci  const size_t candidatesLimit = (150 * 1000000) / span;
485cb93a386Sopenharmony_ci  std::vector<Range> ranges;
486cb93a386Sopenharmony_ci  TextIdx mark = 0;
487cb93a386Sopenharmony_ci  size_t numTries = 0;
488cb93a386Sopenharmony_ci  while (true) {
489cb93a386Sopenharmony_ci    TextIdx dictSize = calculateDictionarySize(ranges);
490cb93a386Sopenharmony_ci    size_t numCandidates = 0;
491cb93a386Sopenharmony_ci    if (dictSize > targetSize - blockLen) {
492cb93a386Sopenharmony_ci      break;
493cb93a386Sopenharmony_ci    }
494cb93a386Sopenharmony_ci    if (maxScore == 0) {
495cb93a386Sopenharmony_ci      break;
496cb93a386Sopenharmony_ci    }
497cb93a386Sopenharmony_ci    while (true) {
498cb93a386Sopenharmony_ci      TextIdx candidate = 0;
499cb93a386Sopenharmony_ci      while (maxScore > 0) {
500cb93a386Sopenharmony_ci        if (candidates[maxScore] != 0) {
501cb93a386Sopenharmony_ci          candidate = candidates[maxScore];
502cb93a386Sopenharmony_ci          candidates[maxScore] = next[candidate];
503cb93a386Sopenharmony_ci          break;
504cb93a386Sopenharmony_ci        }
505cb93a386Sopenharmony_ci        maxScore--;
506cb93a386Sopenharmony_ci      }
507cb93a386Sopenharmony_ci      if (maxScore == 0) {
508cb93a386Sopenharmony_ci        break;
509cb93a386Sopenharmony_ci      }
510cb93a386Sopenharmony_ci      mark++;
511cb93a386Sopenharmony_ci      numTries++;
512cb93a386Sopenharmony_ci      numCandidates++;
513cb93a386Sopenharmony_ci      Score score = 0;
514cb93a386Sopenharmony_ci      for (size_t j = candidate; j < candidate + span; ++j) {
515cb93a386Sopenharmony_ci        MetaSlot& item = map[shortcut[j]];
516cb93a386Sopenharmony_ci        if (item.mark != mark) {
517cb93a386Sopenharmony_ci          score += item.score;
518cb93a386Sopenharmony_ci          item.mark = mark;
519cb93a386Sopenharmony_ci        }
520cb93a386Sopenharmony_ci      }
521cb93a386Sopenharmony_ci      if (score < maxScore) {
522cb93a386Sopenharmony_ci        if (numTries < triesLimit && numCandidates < candidatesLimit) {
523cb93a386Sopenharmony_ci          next[candidate] = candidates[score];
524cb93a386Sopenharmony_ci          candidates[score] = candidate;
525cb93a386Sopenharmony_ci        } else {
526cb93a386Sopenharmony_ci          maxScore = rebuildCandidatesList(
527cb93a386Sopenharmony_ci              &candidates, &map, span, shortcut, end, next.data());
528cb93a386Sopenharmony_ci          mark = 0;
529cb93a386Sopenharmony_ci          numTries = 0;
530cb93a386Sopenharmony_ci          numCandidates = 0;
531cb93a386Sopenharmony_ci        }
532cb93a386Sopenharmony_ci        continue;
533cb93a386Sopenharmony_ci      } else if (score > maxScore) {
534cb93a386Sopenharmony_ci        fprintf(stderr, "Broken invariant\n");
535cb93a386Sopenharmony_ci        return "";
536cb93a386Sopenharmony_ci      }
537cb93a386Sopenharmony_ci      for (TextIdx j = candidate; j < candidate + span; ++j) {
538cb93a386Sopenharmony_ci        MetaSlot& item = map[shortcut[j]];
539cb93a386Sopenharmony_ci        item.score = 0;
540cb93a386Sopenharmony_ci      }
541cb93a386Sopenharmony_ci      addRange(&ranges, candidate, candidate + blockLen);
542cb93a386Sopenharmony_ci      break;
543cb93a386Sopenharmony_ci    }
544cb93a386Sopenharmony_ci  }
545cb93a386Sopenharmony_ci
546cb93a386Sopenharmony_ci  return createDictionary(data, ranges, targetSize);
547cb93a386Sopenharmony_ci}
548cb93a386Sopenharmony_ci
549cb93a386Sopenharmony_cistatic std::string durchschlagGenerateCollaborative(
550cb93a386Sopenharmony_ci    size_t dictionary_size_limit, size_t block_len,
551cb93a386Sopenharmony_ci    const DurchschlagContext& context, const uint8_t* sample_data) {
552cb93a386Sopenharmony_ci  /* Parameters aliasing */
553cb93a386Sopenharmony_ci  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
554cb93a386Sopenharmony_ci  if (targetSize != dictionary_size_limit) {
555cb93a386Sopenharmony_ci    fprintf(stderr, "dictionary_size_limit is too large\n");
556cb93a386Sopenharmony_ci    return "";
557cb93a386Sopenharmony_ci  }
558cb93a386Sopenharmony_ci  TextIdx sliceLen = context.sliceLen;
559cb93a386Sopenharmony_ci  TextIdx total = context.dataSize;
560cb93a386Sopenharmony_ci  TextIdx blockLen = static_cast<TextIdx>(block_len);
561cb93a386Sopenharmony_ci  if (blockLen != block_len) {
562cb93a386Sopenharmony_ci    fprintf(stderr, "block_len is too large\n");
563cb93a386Sopenharmony_ci    return "";
564cb93a386Sopenharmony_ci  }
565cb93a386Sopenharmony_ci  const uint8_t* data = sample_data;
566cb93a386Sopenharmony_ci  const std::vector<TextIdx>& offsets = context.offsets;
567cb93a386Sopenharmony_ci  std::vector<MetaSlot> map(context.numUniqueSlices);
568cb93a386Sopenharmony_ci  const TextIdx* shortcut = context.sliceMap.data();
569cb93a386Sopenharmony_ci
570cb93a386Sopenharmony_ci  /* Initialization */
571cb93a386Sopenharmony_ci  if (blockLen < sliceLen) {
572cb93a386Sopenharmony_ci    fprintf(stderr, "sliceLen is larger than block_len\n");
573cb93a386Sopenharmony_ci    return "";
574cb93a386Sopenharmony_ci  }
575cb93a386Sopenharmony_ci  if (targetSize < blockLen || total < blockLen) {
576cb93a386Sopenharmony_ci    fprintf(stderr, "block_len is too large\n");
577cb93a386Sopenharmony_ci    return "";
578cb93a386Sopenharmony_ci  }
579cb93a386Sopenharmony_ci  TextIdx end = total - sliceLen + 1;
580cb93a386Sopenharmony_ci  ScoreSlices(offsets, map, shortcut, end);
581cb93a386Sopenharmony_ci  TextIdx span = blockLen - sliceLen + 1;
582cb93a386Sopenharmony_ci  end = static_cast<TextIdx>(context.sliceMap.size()) - span;
583cb93a386Sopenharmony_ci  std::vector<Candidate> candidates;
584cb93a386Sopenharmony_ci  candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024);
585cb93a386Sopenharmony_ci  Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
586cb93a386Sopenharmony_ci
587cb93a386Sopenharmony_ci  /* Block selection */
588cb93a386Sopenharmony_ci  std::vector<Range> ranges;
589cb93a386Sopenharmony_ci  TextIdx mark = 0;
590cb93a386Sopenharmony_ci  while (true) {
591cb93a386Sopenharmony_ci    TextIdx dictSize = calculateDictionarySize(ranges);
592cb93a386Sopenharmony_ci    if (dictSize > targetSize - blockLen) {
593cb93a386Sopenharmony_ci      break;
594cb93a386Sopenharmony_ci    }
595cb93a386Sopenharmony_ci    if (minScore == 0 && candidates.empty()) {
596cb93a386Sopenharmony_ci      break;
597cb93a386Sopenharmony_ci    }
598cb93a386Sopenharmony_ci    while (true) {
599cb93a386Sopenharmony_ci      if (candidates.empty()) {
600cb93a386Sopenharmony_ci        minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
601cb93a386Sopenharmony_ci        mark = 0;
602cb93a386Sopenharmony_ci      }
603cb93a386Sopenharmony_ci      TextIdx candidate = candidates[0].position;
604cb93a386Sopenharmony_ci      Score expectedScore = candidates[0].score;
605cb93a386Sopenharmony_ci      if (expectedScore == 0) {
606cb93a386Sopenharmony_ci        candidates.resize(0);
607cb93a386Sopenharmony_ci        break;
608cb93a386Sopenharmony_ci      }
609cb93a386Sopenharmony_ci      std::pop_heap(candidates.begin(), candidates.end(), lessScore());
610cb93a386Sopenharmony_ci      candidates.pop_back();
611cb93a386Sopenharmony_ci      mark++;
612cb93a386Sopenharmony_ci      Score score = 0;
613cb93a386Sopenharmony_ci      for (TextIdx j = candidate; j < candidate + span; ++j) {
614cb93a386Sopenharmony_ci        MetaSlot& item = map[shortcut[j]];
615cb93a386Sopenharmony_ci        if (item.mark != mark) {
616cb93a386Sopenharmony_ci          score += item.score;
617cb93a386Sopenharmony_ci          item.mark = mark;
618cb93a386Sopenharmony_ci        }
619cb93a386Sopenharmony_ci      }
620cb93a386Sopenharmony_ci      if (score < expectedScore) {
621cb93a386Sopenharmony_ci        if (score >= minScore) {
622cb93a386Sopenharmony_ci          candidates.push_back({score, candidate});
623cb93a386Sopenharmony_ci          std::push_heap(candidates.begin(), candidates.end(), lessScore());
624cb93a386Sopenharmony_ci        }
625cb93a386Sopenharmony_ci        continue;
626cb93a386Sopenharmony_ci      } else if (score > expectedScore) {
627cb93a386Sopenharmony_ci        fatal("Broken invariant");
628cb93a386Sopenharmony_ci      }
629cb93a386Sopenharmony_ci      for (TextIdx j = candidate; j < candidate + span; ++j) {
630cb93a386Sopenharmony_ci        MetaSlot& item = map[shortcut[j]];
631cb93a386Sopenharmony_ci        item.score = 0;
632cb93a386Sopenharmony_ci      }
633cb93a386Sopenharmony_ci      addRange(&ranges, candidate, candidate + blockLen);
634cb93a386Sopenharmony_ci      break;
635cb93a386Sopenharmony_ci    }
636cb93a386Sopenharmony_ci  }
637cb93a386Sopenharmony_ci
638cb93a386Sopenharmony_ci  return createDictionary(data, ranges, targetSize);
639cb93a386Sopenharmony_ci}
640cb93a386Sopenharmony_ci
641cb93a386Sopenharmony_cistd::string durchschlag_generate(DurchschalgResourceStrategy strategy,
642cb93a386Sopenharmony_ci    size_t dictionary_size_limit, size_t block_len,
643cb93a386Sopenharmony_ci    const DurchschlagContext& context, const uint8_t* sample_data) {
644cb93a386Sopenharmony_ci  if (strategy == DURCHSCHLAG_COLLABORATIVE) {
645cb93a386Sopenharmony_ci    return durchschlagGenerateCollaborative(
646cb93a386Sopenharmony_ci        dictionary_size_limit, block_len, context, sample_data);
647cb93a386Sopenharmony_ci  } else {
648cb93a386Sopenharmony_ci    return durchschlagGenerateExclusive(
649cb93a386Sopenharmony_ci        dictionary_size_limit, block_len, context, sample_data);
650cb93a386Sopenharmony_ci  }
651cb93a386Sopenharmony_ci}
652cb93a386Sopenharmony_ci
653cb93a386Sopenharmony_civoid durchschlag_distill(size_t slice_len, size_t minimum_population,
654cb93a386Sopenharmony_ci    std::vector<size_t>* sample_sizes, uint8_t* sample_data) {
655cb93a386Sopenharmony_ci  /* Parameters aliasing */
656cb93a386Sopenharmony_ci  uint8_t* data = sample_data;
657cb93a386Sopenharmony_ci
658cb93a386Sopenharmony_ci  /* Build slice map. */
659cb93a386Sopenharmony_ci  DurchschlagContext context = durchschlag_prepare(
660cb93a386Sopenharmony_ci      slice_len, *sample_sizes, data);
661cb93a386Sopenharmony_ci
662cb93a386Sopenharmony_ci  /* Calculate slice population. */
663cb93a386Sopenharmony_ci  const std::vector<TextIdx>& offsets = context.offsets;
664cb93a386Sopenharmony_ci  std::vector<MetaSlot> map(context.numUniqueSlices);
665cb93a386Sopenharmony_ci  const TextIdx* shortcut = context.sliceMap.data();
666cb93a386Sopenharmony_ci  TextIdx sliceLen = context.sliceLen;
667cb93a386Sopenharmony_ci  TextIdx total = context.dataSize;
668cb93a386Sopenharmony_ci  TextIdx end = total - sliceLen + 1;
669cb93a386Sopenharmony_ci  ScoreSlices(offsets, map, shortcut, end);
670cb93a386Sopenharmony_ci
671cb93a386Sopenharmony_ci  /* Condense samples, omitting unique slices. */
672cb93a386Sopenharmony_ci  TextIdx readPos = 0;
673cb93a386Sopenharmony_ci  TextIdx writePos = 0;
674cb93a386Sopenharmony_ci  TextIdx lastNonUniquePos = 0;
675cb93a386Sopenharmony_ci  for (TextIdx i = 0; i < sample_sizes->size(); ++i) {
676cb93a386Sopenharmony_ci    TextIdx sampleStart = writePos;
677cb93a386Sopenharmony_ci    TextIdx oldSampleEnd =
678cb93a386Sopenharmony_ci        readPos + static_cast<TextIdx>(sample_sizes->at(i));
679cb93a386Sopenharmony_ci    while (readPos < oldSampleEnd) {
680cb93a386Sopenharmony_ci      if (readPos < end) {
681cb93a386Sopenharmony_ci        MetaSlot& item = map[shortcut[readPos]];
682cb93a386Sopenharmony_ci        if (item.score >= minimum_population) {
683cb93a386Sopenharmony_ci          lastNonUniquePos = readPos + sliceLen;
684cb93a386Sopenharmony_ci        }
685cb93a386Sopenharmony_ci      }
686cb93a386Sopenharmony_ci      if (readPos < lastNonUniquePos) {
687cb93a386Sopenharmony_ci        data[writePos++] = data[readPos];
688cb93a386Sopenharmony_ci      }
689cb93a386Sopenharmony_ci      readPos++;
690cb93a386Sopenharmony_ci    }
691cb93a386Sopenharmony_ci    sample_sizes->at(i) = writePos - sampleStart;
692cb93a386Sopenharmony_ci  }
693cb93a386Sopenharmony_ci}
694cb93a386Sopenharmony_ci
695cb93a386Sopenharmony_civoid durchschlag_purify(size_t slice_len, size_t minimum_population,
696cb93a386Sopenharmony_ci    const std::vector<size_t>& sample_sizes, uint8_t* sample_data) {
697cb93a386Sopenharmony_ci  /* Parameters aliasing */
698cb93a386Sopenharmony_ci  uint8_t* data = sample_data;
699cb93a386Sopenharmony_ci
700cb93a386Sopenharmony_ci  /* Build slice map. */
701cb93a386Sopenharmony_ci  DurchschlagContext context = durchschlag_prepare(
702cb93a386Sopenharmony_ci      slice_len, sample_sizes, data);
703cb93a386Sopenharmony_ci
704cb93a386Sopenharmony_ci  /* Calculate slice population. */
705cb93a386Sopenharmony_ci  const std::vector<TextIdx>& offsets = context.offsets;
706cb93a386Sopenharmony_ci  std::vector<MetaSlot> map(context.numUniqueSlices);
707cb93a386Sopenharmony_ci  const TextIdx* shortcut = context.sliceMap.data();
708cb93a386Sopenharmony_ci  TextIdx sliceLen = context.sliceLen;
709cb93a386Sopenharmony_ci  TextIdx total = context.dataSize;
710cb93a386Sopenharmony_ci  TextIdx end = total - sliceLen + 1;
711cb93a386Sopenharmony_ci  ScoreSlices(offsets, map, shortcut, end);
712cb93a386Sopenharmony_ci
713cb93a386Sopenharmony_ci  /* Rewrite samples, zeroing out unique slices. */
714cb93a386Sopenharmony_ci  TextIdx lastNonUniquePos = 0;
715cb93a386Sopenharmony_ci  for (TextIdx readPos = 0; readPos < total; ++readPos) {
716cb93a386Sopenharmony_ci    if (readPos < end) {
717cb93a386Sopenharmony_ci      MetaSlot& item = map[shortcut[readPos]];
718cb93a386Sopenharmony_ci      if (item.score >= minimum_population) {
719cb93a386Sopenharmony_ci        lastNonUniquePos = readPos + sliceLen;
720cb93a386Sopenharmony_ci      }
721cb93a386Sopenharmony_ci    }
722cb93a386Sopenharmony_ci    if (readPos >= lastNonUniquePos) {
723cb93a386Sopenharmony_ci      data[readPos] = 0;
724cb93a386Sopenharmony_ci    }
725cb93a386Sopenharmony_ci  }
726cb93a386Sopenharmony_ci}
727