1e5c31af7Sopenharmony_ci#!/usr/bin/env python
2e5c31af7Sopenharmony_ci
3e5c31af7Sopenharmony_ci# Copyright 2020 The Amber Authors. All rights reserved.
4e5c31af7Sopenharmony_ci#
5e5c31af7Sopenharmony_ci# Licensed under the Apache License, Version 2.0 (the "License");
6e5c31af7Sopenharmony_ci# you may not use this file except in compliance with the License.
7e5c31af7Sopenharmony_ci# You may obtain a copy of the License at
8e5c31af7Sopenharmony_ci#
9e5c31af7Sopenharmony_ci#	http://www.apache.org/licenses/LICENSE-2.0
10e5c31af7Sopenharmony_ci#
11e5c31af7Sopenharmony_ci# Unless required by applicable law or agreed to in writing, software
12e5c31af7Sopenharmony_ci# distributed under the License is distributed on an "AS IS" BASIS,
13e5c31af7Sopenharmony_ci# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14e5c31af7Sopenharmony_ci# See the License for the specific language governing permissions and
15e5c31af7Sopenharmony_ci# limitations under the License.
16e5c31af7Sopenharmony_ci
"""
Script to check files for inclusive language. The script scans all files and
flags any non-inclusive terminology it identifies.

Usage: run the script from a folder and it will recursively scan that folder
and everything beneath it.
"""
24e5c31af7Sopenharmony_ci
25e5c31af7Sopenharmony_ciimport fnmatch
26e5c31af7Sopenharmony_ciimport os
27e5c31af7Sopenharmony_ciimport re
28e5c31af7Sopenharmony_ciimport sys
29e5c31af7Sopenharmony_ci
# Patterns for terminology that should be flagged. All are case-insensitive.
# The patterns are searched against the whole file contents without
# re.MULTILINE, so "^" and "$" only anchor at the start and end of the file;
# line boundaries inside the file are covered by the "\s" variants because a
# newline counts as whitespace.
REGEXES = [
	r"(?i)black[-_]?list",
	r"(?i)white[-_]?list",
	r"(?i)gr[ea]y[-_]?list",
	r"(?i)(first class citizen)",
	r"(?i)black[-_]?hat",
	r"(?i)white[-_]?hat",
	r"(?i)gr[ea]y[-_]?hat",
	r"(?i)master",
	r"(?i)slave",
	r"(?i)\bhim\b",
	r"(?i)\bhis\b",
	r"(?i)\bshe\b",
	r"(?i)\bher\b",
	r"(?i)\bhers\b",
	r"(?i)\bman\b",
	r"(?i)\bwoman\b",
	r"(?i)\she\s",
	r"(?i)\she$",
	r"(?i)^he\s",
	r"(?i)^he$",
	# Contractions: accept either an ASCII apostrophe or U+2019 (right single
	# quotation mark). Inside a character class "|" is a literal character,
	# not alternation, so it must not appear here or "he|d" would be flagged.
	r"(?i)\she['\u2019]d\s",
	r"(?i)\she['\u2019]d$",
	r"(?i)^he['\u2019]d\s",
	r"(?i)^he['\u2019]d$",
	r"(?i)\she['\u2019]s\s",
	r"(?i)\she['\u2019]s$",
	r"(?i)^he['\u2019]s\s",
	r"(?i)^he['\u2019]s$",
	r"(?i)\she['\u2019]ll\s",
	r"(?i)\she['\u2019]ll$",
	r"(?i)^he['\u2019]ll\s",
	r"(?i)^he['\u2019]ll$",
	r"(?i)grandfather",
	r"(?i)\bmitm\b",
	r"(?i)\bcrazy\b",
	r"(?i)\binsane\b",
	r"(?i)\bblind\sto\b",
	r"(?i)\bflying\sblind\b",
	r"(?i)\bblind\seye\b",
	r"(?i)\bcripple\b",
	r"(?i)\bcrippled\b",
	r"(?i)\bdumb\b",
	r"(?i)\bdummy\b",
	r"(?i)\bparanoid\b",
	r"(?i)\bsane\b",
	r"(?i)\bsanity\b",
	r"(?i)red[-_]?line",
]
79e5c31af7Sopenharmony_ci
# Patterns that cancel a REGEXES hit when they match at (or three characters
# before) the position of that hit.
SUPPRESSIONS = [
	r"(?i)MS_SLAVE",
	# The hyphen is placed last in the class so it is a literal "-". Written
	# as "[ -_]" it would be a range from space to underscore and would
	# suppress things like "manXpage" or "man9page".
	r"(?i)man[ _-]?page",
]
84e5c31af7Sopenharmony_ci
85e5c31af7Sopenharmony_ci
# Compile every pattern once at import time so the per-file scan only has to
# run the already-compiled objects.
REGEX_LIST = [re.compile(pattern) for pattern in REGEXES]

SUPPRESSION_LIST = [re.compile(pattern) for pattern in SUPPRESSIONS]
93e5c31af7Sopenharmony_ci
def find(top, filename_glob, skip_glob_list):
	"""Collect files below ``top`` whose names match ``filename_glob``.

	Directories whose names match any glob in ``skip_glob_list`` are not
	descended into. A file with the same base name as this script is left
	out of the result.

	Returns a list of paths, each formed by joining the walked directory
	with the file name.
	"""
	matches = []
	for root, subdirs, names in os.walk(top):
		# Prune in place: os.walk only honours edits made to the very list
		# object it handed out.
		subdirs[:] = [
			subdir for subdir in subdirs
			if not any(fnmatch.fnmatch(subdir, skip) for skip in skip_glob_list)
		]
		matches.extend(
			os.path.join(root, name)
			for name in fnmatch.filter(names, filename_glob)
			if name != os.path.basename(__file__)
		)
	return matches
108e5c31af7Sopenharmony_ci
109e5c31af7Sopenharmony_ci
def filtered_descendants(glob):
	"""List files under the current directory whose names match ``glob``.

	Directories matching a fixed set of globs (vendored code, build output,
	version-control metadata) are not searched.
	"""
	skipped_dirs = [
		'third_party',
		'external',
		'build*',
		'out*',
		'CompilerIdCXX',
		'.git',
	]
	return find('.', glob, skipped_dirs)
115e5c31af7Sopenharmony_ci
def _is_suppressed(contents, idx, suppressions):
	"""Return True if any suppression pattern covers the hit starting at idx."""
	for supp in suppressions:
		# Pattern.match(string, pos) anchors at pos without copying the rest
		# of the file the way slicing does. The suppression patterns use no
		# "^" or look-behind, so this is equivalent to matching on a slice.
		if supp.match(contents, idx):
			return True

		# This is a hack to handle the MS_ prefix that is needed
		# to check for. Find a better way if we get more suppressions
		# which modify the prefix of the string
		if idx >= 3 and supp.match(contents, idx - 3):
			return True
	return False


def check_match(filename, contents, regexes=None, suppressions=None):
	"""Check if contents contains any matching entries.

	For each pattern, the first occurrence that is not covered by a
	suppression is reported on stdout, so at most one line is printed per
	pattern. A suppressed occurrence no longer hides later, unsuppressed
	occurrences of the same pattern.

	Args:
		filename: Name used as the prefix of each reported line.
		contents: Full text to scan.
		regexes: Compiled patterns to look for; defaults to REGEX_LIST.
		suppressions: Compiled patterns that cancel a hit; defaults to
			SUPPRESSION_LIST.

	Returns:
		True if at least one unsuppressed occurrence was reported, else False.
	"""
	if regexes is None:
		regexes = REGEX_LIST
	if suppressions is None:
		suppressions = SUPPRESSION_LIST

	ret = False
	for reg in regexes:
		for match in reg.finditer(contents):
			if _is_suppressed(contents, match.start(), suppressions):
				continue
			# No matching suppression.
			print("{}: found non-inclusive language: {}".format(
					filename, match.group(0)))
			ret = True
			# One report per pattern is enough to flag the file.
			break

	return ret
144e5c31af7Sopenharmony_ci
145e5c31af7Sopenharmony_ci
def alert_if_lang_matches(glob, verbose=False):
	"""Prints names of all files matching non-inclusive language.

	Finds all glob-matching files under the current directory and checks if they
	contain the language pattern.  Prints the names of all the files that
	match.

	Files that cannot be opened, or whose contents are not valid UTF-8
	(typically binary files), are skipped.

	Args:
		glob: File-name glob selecting which files to check.
		verbose: When True, print a "skipping ..." line for every file that
			had to be skipped.

	Returns the total number of files in which a match was reported.
	"""
	printed_count = 0
	for path in filtered_descendants(glob):
		# Only the I/O is guarded. A bare "except:" here would also swallow
		# KeyboardInterrupt and hide real errors raised while matching.
		try:
			with open(path, 'r', encoding='utf8') as handle:
				text = handle.read()
		except (OSError, UnicodeError):
			if verbose:
				print("skipping {}".format(path))
			continue

		if check_match(path, text):
			printed_count += 1

	return printed_count
168e5c31af7Sopenharmony_ci
169e5c31af7Sopenharmony_ci
def main():
	"""Scan the current directory tree and exit non-zero if anything is flagged.

	The exit status is 1 when at least one file contained a reported match
	and 0 otherwise.
	"""
	patterns = ['*']
	total = sum(alert_if_lang_matches(pattern) for pattern in patterns)
	sys.exit(total > 0)


if __name__ == '__main__':
	main()
180