1import os
2import json
3import argparse
4import logging
5from typing import List, Dict, Optional
6
# Keys that every component object in a README.OpenSource file must provide.
# Kept as a list, in the order in which missing fields are reported.
REQUIRED_FIELDS = [
    "Name", "License", "License File", "Version Number",
    "Owner", "Upstream URL", "Description",
]
16
class OpenSourceValidator:
    """Validate README.OpenSource metadata files found under a project tree.

    A README.OpenSource file is expected to contain a JSON array of component
    objects. Format validation checks that each component provides every key
    in ``REQUIRED_FIELDS``. Content validation compares selected fields with
    reference data and checks that the referenced license file exists.
    All findings are reported through the root ``logging`` logger.
    """

    # Fields compared against the reference entry. "Name" is not listed
    # because the reference entry is looked up by name and therefore always
    # matches on that field.
    _COMPARED_FIELDS = ("License", "Version Number", "Upstream URL")

    def __init__(
        self,
        project_root: str,
        log_file: Optional[str] = None,
        reference_data: Optional[List[Dict[str, str]]] = None,
    ):
        """Store the configuration and set up logging.

        Args:
            project_root: Directory that is searched recursively.
            log_file: Optional path of a log file; when empty or ``None`` the
                log goes to the default stream handler.
            reference_data: Optional list of reference component objects.
        """
        self.project_root = project_root
        self.reference_data = reference_data or []
        self.log_file = log_file

        # basicConfig is a no-op when the root logger already has handlers,
        # so an application that configured logging earlier keeps its setup.
        # filename=None selects the default stream handler.
        logging.basicConfig(
            filename=self.log_file or None,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
        )

    @staticmethod
    def _load_json(path: str):
        """Parse *path* as UTF-8 encoded JSON and return the decoded value.

        Raises:
            ValueError: With a log-ready reason when the file cannot be read
                or does not contain valid JSON.
        """
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except json.JSONDecodeError as e:
            raise ValueError(f"JSON decode error: {e}") from e
        except (OSError, UnicodeDecodeError) as e:
            raise ValueError(f"Unexpected error: {e}") from e

    def find_all_readmes(self) -> List[str]:
        """Return the paths of all README.OpenSource files below the root."""
        return [
            os.path.join(dirpath, "README.OpenSource")
            for dirpath, _, filenames in os.walk(self.project_root)
            if "README.OpenSource" in filenames
        ]

    def validate_format(self, readme_path: str) -> bool:
        """Check the structure and required fields of one README.OpenSource.

        Every problem found is logged, including the reason a file could not
        be parsed at all.

        Returns:
            ``True`` when the file is a JSON array whose elements are objects
            containing every key in ``REQUIRED_FIELDS``; ``False`` otherwise.
        """
        try:
            data = self._load_json(readme_path)
        except ValueError as e:
            logging.error(f"{readme_path}: {e}")
            return False

        if not isinstance(data, list):
            logging.error(f"{readme_path}: The file does not contain a JSON array.")
            return False

        errors = []
        for idx, component in enumerate(data, start=1):
            # A non-object element cannot carry fields; without this check
            # "in" would do a substring test on strings or raise on numbers.
            if not isinstance(component, dict):
                errors.append(f"Component {idx} is not a JSON object.")
                continue
            errors.extend(
                f"Component {idx} is missing required field: {field}"
                for field in REQUIRED_FIELDS
                if field not in component
            )

        if errors:
            for error in errors:
                logging.error(f"{readme_path}: {error}")
            return False

        logging.info(f"{readme_path} format is valid.")
        return True

    def load_reference_data(self, reference_data_path: str):
        """Load the reference data from a JSON file.

        The previously loaded reference data is left untouched on failure.

        Raises:
            ValueError: When the file cannot be read or parsed, or when its
                top-level value is not a JSON array.
        """
        try:
            with open(reference_data_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception as e:
            # Callers rely on ValueError for every load failure; keep the
            # original exception attached as the cause.
            raise ValueError(
                f"Failed to load reference data from {reference_data_path}: {e}"
            ) from e

        if not isinstance(loaded, list):
            raise ValueError(
                f"Failed to load reference data from {reference_data_path}: "
                f"expected a JSON array, got {type(loaded).__name__}"
            )
        self.reference_data = loaded

    def find_reference_data(self, name: str) -> Optional[Dict[str, str]]:
        """Return the first reference entry whose "Name" equals *name*.

        Entries that are not JSON objects are skipped. Returns ``None`` when
        no entry matches.
        """
        for reference in self.reference_data:
            if isinstance(reference, dict) and reference.get("Name") == name:
                return reference
        return None

    def validate_content(self, readme_path: str) -> bool:
        """Compare one README.OpenSource file against the reference data.

        For each component the license, version number and upstream URL must
        equal the values of the reference entry with the same name, and the
        "License File" path must exist. All mismatches are logged.

        Returns:
            ``True`` when every component passes; ``False`` otherwise, also
            when the file cannot be read or is not a JSON array.
        """
        # Step 1: read the JSON file.
        try:
            readme_data = self._load_json(readme_path)
        except ValueError as e:
            logging.error(f"{readme_path}: {e}")
            return False

        if not isinstance(readme_data, list):
            logging.error(f"{readme_path}: JSON data is not an array.")
            return False

        # Step 2: check the metadata of every component in the array.
        all_valid = True
        for software_data in readme_data:
            if not isinstance(software_data, dict):
                logging.error(
                    f"{readme_path}: Software entry is not a JSON object: {software_data!r}"
                )
                all_valid = False
                continue

            name = software_data.get("Name")
            if not name:
                logging.error(f"{readme_path}: Missing 'Name' field in software data.")
                all_valid = False
                continue

            reference_data = self.find_reference_data(name)
            if reference_data is None:
                logging.error(
                    f"{readme_path}: Software '{name}' not found in reference data."
                )
                all_valid = False
                continue

            for field in self._COMPARED_FIELDS:
                expected_value = reference_data.get(field)
                actual_value = software_data.get(field)
                if actual_value != expected_value:
                    logging.error(
                        f"{readme_path}: Field '{field}' mismatch for '{name}'. Expected: '{expected_value}', Found: '{actual_value}'"
                    )
                    all_valid = False

            # Check that the "License File" path exists.
            if not self.validate_license_file(readme_path, software_data.get("License File")):
                all_valid = False

        if all_valid:
            logging.info(f"{readme_path}: Content validation passed.")
        else:
            logging.error(f"{readme_path}: Content validation failed.")
        return all_valid

    def validate_license_file(self, readme_path: str, license_file: Optional[str]) -> bool:
        """Check that the license file exists.

        The path is resolved relative to the directory containing
        *readme_path*.

        Returns:
            ``True`` when the path exists; ``False`` when the value is
            missing, empty, not a string, or the path does not exist.
        """
        if not license_file:
            logging.error(f"{readme_path}: 'License File' field is missing.")
            return False

        # The value comes straight from JSON and may be a number, list, etc.;
        # os.path.join would raise TypeError for those.
        if not isinstance(license_file, str):
            logging.error(
                f"{readme_path}: 'License File' must be a string path, found: {license_file!r}"
            )
            return False

        readme_dir = os.path.dirname(readme_path)
        license_file_path = os.path.join(readme_dir, license_file)

        if not os.path.exists(license_file_path):
            logging.error(
                f"{readme_path}: License file '{license_file}' not found at: {license_file_path}"
            )
            return False

        logging.info(f"{readme_path}: License file '{license_file}' exists.")
        return True

    def run_validation(self, validate_format: bool = True, validate_content: bool = False) -> bool:
        """Run the selected validations on every README.OpenSource file.

        A file that fails format validation is not content-validated. An
        unexpected error while handling one file is logged and does not stop
        the remaining files from being processed. This method does not raise.

        Args:
            validate_format: Run the format check on each file.
            validate_content: Run the reference-data comparison on each file.

        Returns:
            ``True`` when at least one file was found and every requested
            check passed; ``False`` otherwise. Callers may use this to derive
            a process exit status.
        """
        try:
            readme_paths = self.find_all_readmes()
        except Exception as e:
            logging.error(f"Validation failed: {e}")
            return False

        if not readme_paths:
            logging.error("No README.OpenSource files found in the project directory.")
            return False

        all_passed = True
        for readme_path in readme_paths:
            logging.info(f"Validating: {readme_path}")
            try:
                if validate_format and not self.validate_format(readme_path):
                    logging.error(f"{readme_path}: Format validation failed.")
                    all_passed = False
                    continue  # Skip content validation when the format is bad.
                # validate_content logs its own pass/fail summary line.
                if validate_content and not self.validate_content(readme_path):
                    all_passed = False
            except Exception as e:
                # Top-level boundary: report and move on to the next file.
                logging.error(f"{readme_path}: Validation failed: {e}")
                all_passed = False

        logging.info("Validation process completed.")
        return all_passed
185
186
def main():
    """Command-line entry point.

    Parses the arguments, builds an ``OpenSourceValidator`` for the given
    project root, loads the reference data when content validation is
    requested, and runs the selected validations. Format validation is the
    default when neither mode flag is given.
    """
    cli = argparse.ArgumentParser(
        description="Validate README.OpenSource files in a project."
    )
    cli.add_argument("project_root", help="The root directory of the project.")
    cli.add_argument(
        "--validate-format",
        action="store_true",
        help="Validate the format of README.OpenSource files.",
    )
    cli.add_argument(
        "--validate-content",
        action="store_true",
        help="Validate the content of README.OpenSource files against reference data.",
    )
    cli.add_argument(
        "--reference-data",
        help="Path to the reference data JSON file (required for content validation).",
    )
    cli.add_argument("--log-file", help="Path to the log file for validation results.")

    options = cli.parse_args()
    check_content = options.validate_content

    # Content validation cannot run without something to compare against.
    if check_content and not options.reference_data:
        cli.error("--reference-data is required for content validation.")

    # Build the validator.
    checker = OpenSourceValidator(
        project_root=options.project_root,
        log_file=options.log_file,
    )

    # Load the reference data from the configuration file.
    if check_content:
        checker.load_reference_data(options.reference_data)

    # Run the checks. The format check runs when requested explicitly or
    # when content validation was not requested (i.e. no mode flag at all).
    checker.run_validation(
        validate_format=options.validate_format or not check_content,
        validate_content=check_content,
    )


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
227
228