#!/bin/sh

# This script is responsible for generating some of the Unicode tables used
# in regex-syntax.
#
# Usage is simple, first download the Unicode data:
#
#   $ mkdir ucd
#   $ cd ucd
#   $ curl -LO https://www.unicode.org/Public/zipped/14.0.0/UCD.zip
#   $ unzip UCD.zip
#
# And then run this script from the root of this repository by pointing it at
# the data directory downloaded above:
#
#   $ ./scripts/generate-unicode-tables path/to/ucd
#
# Once complete, if you are upgrading to a new version of Unicode,
# you'll need to add a new "age" value to the 'ages' routine in
# regex-syntax/src/unicode.rs.

# Stop at the first failing command and treat unset variables as errors.
# Every generator below redirects straight into its destination file, and the
# shell truncates that file before the generator runs; without -e a failed
# step would leave an empty/partial table behind while the script carried on
# to the formatting step and exited successfully.
set -eu

if [ $# != 1 ]; then
  echo "Usage: $(basename "$0") <ucd-data-directory>" >&2
  exit 1
fi
ucddir="$1"

# Output paths are relative, so the script only works from the repository
# root (see usage notes above).
out="regex-syntax/src/unicode_tables"

# Validate both directories up front, before any redirection can truncate an
# existing table.
if [ ! -d "$ucddir" ]; then
  echo "error: UCD data directory not found: $ucddir" >&2
  exit 1
fi
if [ ! -d "$out" ]; then
  echo "error: $out not found; run this script from the repository root" >&2
  exit 1
fi

ucd-generate age "$ucddir" \
  --chars > "$out/age.rs"
ucd-generate case-folding-simple "$ucddir" \
  --chars --all-pairs > "$out/case_folding_simple.rs"
ucd-generate general-category "$ucddir" \
  --chars --exclude surrogate > "$out/general_category.rs"
ucd-generate grapheme-cluster-break "$ucddir" \
  --chars > "$out/grapheme_cluster_break.rs"
ucd-generate property-bool "$ucddir" \
  --chars > "$out/property_bool.rs"
ucd-generate property-names "$ucddir" \
  > "$out/property_names.rs"
ucd-generate property-values "$ucddir" \
  --include gc,script,scx,age,gcb,wb,sb > "$out/property_values.rs"
ucd-generate script "$ucddir" \
  --chars > "$out/script.rs"
ucd-generate script-extension "$ucddir" \
  --chars > "$out/script_extension.rs"
ucd-generate sentence-break "$ucddir" \
  --chars > "$out/sentence_break.rs"
ucd-generate word-break "$ucddir" \
  --chars > "$out/word_break.rs"

# These generate the \w, \d and \s Unicode-aware character classes. \d and \s
# are technically part of the general category and boolean properties generated
# above. However, these are generated separately to make it possible to enable
# or disable them via Cargo features independently of whether all boolean
# properties or general categories are enabled or disabled. The crate ensures
# that only one copy is compiled.
ucd-generate perl-word "$ucddir" \
  --chars > "$out/perl_word.rs"
ucd-generate general-category "$ucddir" \
  --chars --include decimalnumber > "$out/perl_decimal.rs"
ucd-generate property-bool "$ucddir" \
  --chars --include whitespace > "$out/perl_space.rs"

# Make sure everything is formatted.
cargo +stable fmt --all