1fb6c1f39Sopenharmony_ci#!/usr/bin/env python
2fb6c1f39Sopenharmony_ci
3fb6c1f39Sopenharmony_ci# This does simple normalized frequency analysis on UTF-8 encoded text. The
4fb6c1f39Sopenharmony_ci# result of the analysis is translated to a ranked list, where every byte is
5fb6c1f39Sopenharmony_ci# assigned a rank. This list is written to src/freqs.rs.
6fb6c1f39Sopenharmony_ci#
7fb6c1f39Sopenharmony_ci# Currently, the frequencies are generated from the following corpuses:
8fb6c1f39Sopenharmony_ci#
9fb6c1f39Sopenharmony_ci#   * The CIA world fact book
10fb6c1f39Sopenharmony_ci#   * The source code of rustc
11fb6c1f39Sopenharmony_ci#   * Septuaginta
12fb6c1f39Sopenharmony_ci
13fb6c1f39Sopenharmony_cifrom __future__ import absolute_import, division, print_function
14fb6c1f39Sopenharmony_ci
15fb6c1f39Sopenharmony_ciimport argparse
16fb6c1f39Sopenharmony_cifrom collections import Counter
17fb6c1f39Sopenharmony_ciimport sys
18fb6c1f39Sopenharmony_ci
19fb6c1f39Sopenharmony_cipreamble = '''
20fb6c1f39Sopenharmony_ci// NOTE: The following code was generated by "scripts/frequencies.py", do not
21fb6c1f39Sopenharmony_ci// edit directly
22fb6c1f39Sopenharmony_ci'''.lstrip()
23fb6c1f39Sopenharmony_ci
24fb6c1f39Sopenharmony_ci
25fb6c1f39Sopenharmony_cidef eprint(*args, **kwargs):
26fb6c1f39Sopenharmony_ci    kwargs['file'] = sys.stderr
27fb6c1f39Sopenharmony_ci    print(*args, **kwargs)
28fb6c1f39Sopenharmony_ci
29fb6c1f39Sopenharmony_ci
30fb6c1f39Sopenharmony_cidef main():
31fb6c1f39Sopenharmony_ci    p = argparse.ArgumentParser()
32fb6c1f39Sopenharmony_ci    p.add_argument('corpus', metavar='FILE', nargs='+')
33fb6c1f39Sopenharmony_ci    args = p.parse_args()
34fb6c1f39Sopenharmony_ci
35fb6c1f39Sopenharmony_ci    # Get frequency counts of each byte.
36fb6c1f39Sopenharmony_ci    freqs = Counter()
37fb6c1f39Sopenharmony_ci    for i in range(0, 256):
38fb6c1f39Sopenharmony_ci        freqs[i] = 0
39fb6c1f39Sopenharmony_ci
40fb6c1f39Sopenharmony_ci    eprint('reading entire corpus into memory')
41fb6c1f39Sopenharmony_ci    corpus = []
42fb6c1f39Sopenharmony_ci    for fpath in args.corpus:
43fb6c1f39Sopenharmony_ci        corpus.append(open(fpath, 'rb').read())
44fb6c1f39Sopenharmony_ci
45fb6c1f39Sopenharmony_ci    eprint('computing byte frequencies')
46fb6c1f39Sopenharmony_ci    for c in corpus:
47fb6c1f39Sopenharmony_ci        for byte in c:
48fb6c1f39Sopenharmony_ci            freqs[byte] += 1.0 / float(len(c))
49fb6c1f39Sopenharmony_ci
50fb6c1f39Sopenharmony_ci    eprint('writing Rust code')
51fb6c1f39Sopenharmony_ci    # Get the rank of each byte. A lower rank => lower relative frequency.
52fb6c1f39Sopenharmony_ci    rank = [0] * 256
53fb6c1f39Sopenharmony_ci    for i, (byte, _) in enumerate(freqs.most_common()):
54fb6c1f39Sopenharmony_ci        # print(byte)
55fb6c1f39Sopenharmony_ci        rank[byte] = 255 - i
56fb6c1f39Sopenharmony_ci
57fb6c1f39Sopenharmony_ci    # Forcefully set the highest rank possible for bytes that start multi-byte
58fb6c1f39Sopenharmony_ci    # UTF-8 sequences. The idea here is that a continuation byte will be more
59fb6c1f39Sopenharmony_ci    # discerning in a homogenous haystack.
60fb6c1f39Sopenharmony_ci    for byte in range(0xC0, 0xFF + 1):
61fb6c1f39Sopenharmony_ci        rank[byte] = 255
62fb6c1f39Sopenharmony_ci
63fb6c1f39Sopenharmony_ci    # Now write Rust.
64fb6c1f39Sopenharmony_ci    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
65fb6c1f39Sopenharmony_ci    for byte in range(256):
66fb6c1f39Sopenharmony_ci        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
67fb6c1f39Sopenharmony_ci    olines.append('];')
68fb6c1f39Sopenharmony_ci
69fb6c1f39Sopenharmony_ci    print(preamble)
70fb6c1f39Sopenharmony_ci    print('\n'.join(olines))
71fb6c1f39Sopenharmony_ci
72fb6c1f39Sopenharmony_ci
73fb6c1f39Sopenharmony_ciif __name__ == '__main__':
74fb6c1f39Sopenharmony_ci    main()
75