1fb6c1f39Sopenharmony_ci#!/usr/bin/env python 2fb6c1f39Sopenharmony_ci 3fb6c1f39Sopenharmony_ci# This does simple normalized frequency analysis on UTF-8 encoded text. The 4fb6c1f39Sopenharmony_ci# result of the analysis is translated to a ranked list, where every byte is 5fb6c1f39Sopenharmony_ci# assigned a rank. This list is written to src/freqs.rs. 6fb6c1f39Sopenharmony_ci# 7fb6c1f39Sopenharmony_ci# Currently, the frequencies are generated from the following corpuses: 8fb6c1f39Sopenharmony_ci# 9fb6c1f39Sopenharmony_ci# * The CIA world fact book 10fb6c1f39Sopenharmony_ci# * The source code of rustc 11fb6c1f39Sopenharmony_ci# * Septuaginta 12fb6c1f39Sopenharmony_ci 13fb6c1f39Sopenharmony_cifrom __future__ import absolute_import, division, print_function 14fb6c1f39Sopenharmony_ci 15fb6c1f39Sopenharmony_ciimport argparse 16fb6c1f39Sopenharmony_cifrom collections import Counter 17fb6c1f39Sopenharmony_ciimport sys 18fb6c1f39Sopenharmony_ci 19fb6c1f39Sopenharmony_cipreamble = ''' 20fb6c1f39Sopenharmony_ci// NOTE: The following code was generated by "scripts/frequencies.py", do not 21fb6c1f39Sopenharmony_ci// edit directly 22fb6c1f39Sopenharmony_ci'''.lstrip() 23fb6c1f39Sopenharmony_ci 24fb6c1f39Sopenharmony_ci 25fb6c1f39Sopenharmony_cidef eprint(*args, **kwargs): 26fb6c1f39Sopenharmony_ci kwargs['file'] = sys.stderr 27fb6c1f39Sopenharmony_ci print(*args, **kwargs) 28fb6c1f39Sopenharmony_ci 29fb6c1f39Sopenharmony_ci 30fb6c1f39Sopenharmony_cidef main(): 31fb6c1f39Sopenharmony_ci p = argparse.ArgumentParser() 32fb6c1f39Sopenharmony_ci p.add_argument('corpus', metavar='FILE', nargs='+') 33fb6c1f39Sopenharmony_ci args = p.parse_args() 34fb6c1f39Sopenharmony_ci 35fb6c1f39Sopenharmony_ci # Get frequency counts of each byte. 36fb6c1f39Sopenharmony_ci freqs = Counter() 37fb6c1f39Sopenharmony_ci for i in range(0, 256): 38fb6c1f39Sopenharmony_ci freqs[i] = 0 39fb6c1f39Sopenharmony_ci 40fb6c1f39Sopenharmony_ci eprint('reading entire corpus into memory') 41fb6c1f39Sopenharmony_ci corpus = [] 42fb6c1f39Sopenharmony_ci for fpath in args.corpus: 43fb6c1f39Sopenharmony_ci corpus.append(open(fpath, 'rb').read()) 44fb6c1f39Sopenharmony_ci 45fb6c1f39Sopenharmony_ci eprint('computing byte frequencies') 46fb6c1f39Sopenharmony_ci for c in corpus: 47fb6c1f39Sopenharmony_ci for byte in c: 48fb6c1f39Sopenharmony_ci freqs[byte] += 1.0 / float(len(c)) 49fb6c1f39Sopenharmony_ci 50fb6c1f39Sopenharmony_ci eprint('writing Rust code') 51fb6c1f39Sopenharmony_ci # Get the rank of each byte. A lower rank => lower relative frequency. 52fb6c1f39Sopenharmony_ci rank = [0] * 256 53fb6c1f39Sopenharmony_ci for i, (byte, _) in enumerate(freqs.most_common()): 54fb6c1f39Sopenharmony_ci # print(byte) 55fb6c1f39Sopenharmony_ci rank[byte] = 255 - i 56fb6c1f39Sopenharmony_ci 57fb6c1f39Sopenharmony_ci # Forcefully set the highest rank possible for bytes that start multi-byte 58fb6c1f39Sopenharmony_ci # UTF-8 sequences. The idea here is that a continuation byte will be more 59fb6c1f39Sopenharmony_ci # discerning in a homogenous haystack. 60fb6c1f39Sopenharmony_ci for byte in range(0xC0, 0xFF + 1): 61fb6c1f39Sopenharmony_ci rank[byte] = 255 62fb6c1f39Sopenharmony_ci 63fb6c1f39Sopenharmony_ci # Now write Rust. 64fb6c1f39Sopenharmony_ci olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = ['] 65fb6c1f39Sopenharmony_ci for byte in range(256): 66fb6c1f39Sopenharmony_ci olines.append(' %3d, // %r' % (rank[byte], chr(byte))) 67fb6c1f39Sopenharmony_ci olines.append('];') 68fb6c1f39Sopenharmony_ci 69fb6c1f39Sopenharmony_ci print(preamble) 70fb6c1f39Sopenharmony_ci print('\n'.join(olines)) 71fb6c1f39Sopenharmony_ci 72fb6c1f39Sopenharmony_ci 73fb6c1f39Sopenharmony_ciif __name__ == '__main__': 74fb6c1f39Sopenharmony_ci main() 75