#!/bin/sh

# This script is responsible for generating some of the Unicode tables used
# in regex-syntax.
#
# Usage is simple. First, download the Unicode data:
#
#   $ mkdir ucd
#   $ cd ucd
#   $ curl -LO https://www.unicode.org/Public/zipped/14.0.0/UCD.zip
#   $ unzip UCD.zip
#
# And then run this script from the root of this repository by pointing it at
# the data directory downloaded above:
#
#   $ ./scripts/generate-unicode-tables path/to/ucd
#
# Once complete, if you are upgrading to a new version of Unicode,
# you'll need to add a new "age" value to the 'ages' routine in
# regex-syntax/src/unicode.rs.

# Abort on the first failing command or on use of an unset variable, so that a
# failed generation step is never silently followed by the remaining steps.
set -eu

if [ "$#" -ne 1 ]; then
    echo "Usage: $(basename "$0") <ucd-data-directory>" >&2
    exit 1
fi
ucddir="$1"

# Directory (relative to the repository root) that receives the tables.
out="regex-syntax/src/unicode_tables"

# Validate the environment up front so nothing is touched if a prerequisite
# is missing.
if [ ! -d "$ucddir" ]; then
    echo "error: UCD data directory '$ucddir' does not exist" >&2
    exit 1
fi
if [ ! -d "$out" ]; then
    echo "error: '$out' not found;" \
        "run this script from the root of the repository" >&2
    exit 1
fi
for tool in ucd-generate cargo; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "error: required tool '$tool' was not found in PATH" >&2
        exit 1
    fi
done

# Path of the temporary file currently being written, or empty if none. It is
# removed on any exit path so that an aborted run leaves no stray files.
tmp=""
cleanup() {
    if [ -n "$tmp" ]; then
        rm -f "$tmp"
    fi
}
trap cleanup EXIT
# Turn the common termination signals into a normal exit so that the EXIT
# trap above also runs when the script is interrupted.
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# generate <output-file> <ucd-generate-subcommand> [<ucd-generate-args>...]
#
# Runs `ucd-generate` with the given subcommand and arguments and stores its
# standard output in "$out/<output-file>". The output is first written to a
# temporary file next to the destination and only renamed over it once
# ucd-generate has succeeded, so a failure never truncates an existing table.
# On failure an error is printed to stderr and the script exits with status 1.
generate() {
    dest="$out/$1"
    shift
    tmp="$dest.tmp.$$"
    if ! ucd-generate "$@" > "$tmp"; then
        echo "error: 'ucd-generate $1' failed; '$dest' was left unchanged" >&2
        exit 1
    fi
    mv -f "$tmp" "$dest"
    tmp=""
}

generate age.rs age "$ucddir" \
    --chars
generate case_folding_simple.rs case-folding-simple "$ucddir" \
    --chars --all-pairs
generate general_category.rs general-category "$ucddir" \
    --chars --exclude surrogate
generate grapheme_cluster_break.rs grapheme-cluster-break "$ucddir" \
    --chars
generate property_bool.rs property-bool "$ucddir" \
    --chars
generate property_names.rs property-names "$ucddir"
generate property_values.rs property-values "$ucddir" \
    --include gc,script,scx,age,gcb,wb,sb
generate script.rs script "$ucddir" \
    --chars
generate script_extension.rs script-extension "$ucddir" \
    --chars
generate sentence_break.rs sentence-break "$ucddir" \
    --chars
generate word_break.rs word-break "$ucddir" \
    --chars

# These generate the \w, \d and \s Unicode-aware character classes. \d and \s
# are technically part of the general category and boolean properties generated
# above. However, these are generated separately to make it possible to enable
# or disable them via Cargo features independently of whether all boolean
# properties or general categories are enabled or disabled. The crate ensures
# that only one copy is compiled.
generate perl_word.rs perl-word "$ucddir" \
    --chars
generate perl_decimal.rs general-category "$ucddir" \
    --chars --include decimalnumber
generate perl_space.rs property-bool "$ucddir" \
    --chars --include whitespace

# Make sure everything is formatted.
cargo +stable fmt --all