Files
ossm-configurator/scripts/generate_manifest_from_site.py

328 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Generate vendor manifest from site component JSON files.
Scans website/src/data/components/*.json (override with --site-dir) for
printedParts, bodyParts, and knobs entries with GitHub URLs and creates or
updates manifest/vendor_manifest.json (override with --manifest).
"""
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Any
from urllib.parse import urlparse, parse_qs, unquote
def parse_github_url(url: str) -> Optional[Dict[str, str]]:
    """
    Parse a GitHub file URL into its owner, repo, ref, and path.

    Supports:
    - https://github.com/owner/repo/blob/<ref>/path/to/file
    - https://github.com/owner/repo/raw/<ref>/path/to/file
    - https://raw.githubusercontent.com/owner/repo/<ref>/path/to/file

    Other github.com URLs are accepted only when they carry a ``raw`` query
    parameter (e.g. ``?raw=true``).

    The ref is assumed to be a single path segment; a branch name that
    contains '/' cannot be told apart from the file path and will be split
    after its first segment.

    Args:
        url: Candidate URL. Non-string or empty values are rejected.

    Returns:
        A dict with the keys 'owner', 'repo', 'ref', and 'path' (the path is
        percent-decoded, without query string or fragment), or None when the
        URL is not a recognised GitHub file URL.
    """
    if not url or not isinstance(url, str):
        return None

    try:
        parsed = urlparse(url.strip())
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced brackets).
        return None

    # Match on the real hostname. A substring test for 'github.com' would
    # reject 'raw.githubusercontent.com' (it does not contain that substring)
    # and would accept unrelated hosts that merely mention github.com.
    host = (parsed.hostname or '').lower()
    # parsed.path already excludes the query string and the fragment.
    segments = parsed.path.strip('/').split('/')

    if host == 'raw.githubusercontent.com':
        # /owner/repo/<ref>/path...
        if len(segments) < 4:
            return None
        owner, repo, ref = segments[:3]
        raw_path = '/'.join(segments[3:])
    elif host in ('github.com', 'www.github.com'):
        # /owner/repo/<mode>/<ref>/path...
        if len(segments) < 5:
            return None
        owner, repo, mode, ref = segments[:4]
        if mode not in ('blob', 'raw') and 'raw' not in parse_qs(parsed.query):
            return None
        raw_path = '/'.join(segments[4:])
    else:
        return None

    # Decode only after the query has been split off, so an encoded '?' in a
    # file name is preserved instead of truncating the path.
    file_path = unquote(raw_path)
    if not (owner and repo and ref and file_path):
        return None

    return {
        'owner': owner,
        'repo': repo,
        'ref': ref,
        'path': file_path,
    }
def find_printed_parts(data: Any, path: str = '') -> List[Dict[str, Any]]:
    """
    Recursively collect part entries from a nested JSON structure.

    A part is any dict carrying an 'id' key that appears in a list stored
    under 'printedParts', 'bodyParts', or 'knobs'. Values under those keys
    that are not lists (e.g. ``null``) are ignored rather than iterated.

    Args:
        data: Decoded JSON value (dict, list, or scalar).
        path: Label of the JSON file being scanned; copied into each result.

    Returns:
        A list of dicts with the keys 'part' (the part dict itself),
        'json_path' (the ``path`` argument), and 'part_id' (the part's 'id').
    """
    parts: List[Dict[str, Any]] = []

    if isinstance(data, dict):
        # Part-bearing keys, checked in a fixed order so results are stable.
        for key in ('printedParts', 'bodyParts', 'knobs'):
            candidates = data.get(key)
            if not isinstance(candidates, list):
                continue
            for part in candidates:
                if isinstance(part, dict) and 'id' in part:
                    parts.append({
                        'part': part,
                        'json_path': path,
                        'part_id': part.get('id'),
                    })
        # Descend into every nested container, including the lists above:
        # a part may itself hold further part lists.
        for value in data.values():
            if isinstance(value, (dict, list)):
                parts.extend(find_printed_parts(value, path))
    elif isinstance(data, list):
        for item in data:
            parts.extend(find_printed_parts(item, path))

    return parts
def generate_manifest_id(part_id: str, owner: str, repo: str, path: str) -> str:
    """
    Return the manifest ID for a part.

    A truthy ``part_id`` is returned unchanged. Otherwise an ID is derived
    from ``owner``, ``repo``, and ``path``: slashes and spaces in the path
    become hyphens, every character outside ``[a-zA-Z0-9_-]`` is dropped, and
    the result is cut to at most 100 characters.
    """
    if not part_id:
        flattened = path.replace('/', '-').replace(' ', '-')
        candidate = f"{owner}-{repo}-{flattened}"
        # Strip anything that is not alphanumeric, underscore, or hyphen.
        cleaned = re.sub(r'[^a-zA-Z0-9_-]', '', candidate)
        return cleaned[:100]
    return part_id
def generate_local_path(owner: str, repo: str, path: str) -> str:
    """
    Build the vendor-relative location for a file.

    The result has the form ``vendor/<owner>-<repo>/<path>``; ``path`` is
    appended as given.
    """
    return "vendor/{}-{}/{}".format(owner, repo, path)
def load_existing_manifest(manifest_path: Path) -> Dict[str, Dict]:
    """
    Load an existing manifest as a dict keyed by entry ID.

    Three on-disk layouts are accepted:
    - a JSON list of entries, each with an 'id' key;
    - a JSON object with an 'entries' list of such entries;
    - a JSON object already keyed by ID (returned as-is).

    Args:
        manifest_path: Location of the manifest file.

    Returns:
        The entries keyed by ID. An empty dict is returned when the file does
        not exist, holds an unsupported top-level value, or is malformed
        (invalid JSON, an entry without 'id', or an entry that is not a
        dict); malformed files also produce a warning on stderr.
    """
    if not manifest_path.exists():
        return {}

    try:
        with open(manifest_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            return {entry['id']: entry for entry in data}
        if isinstance(data, dict):
            if 'entries' in data:
                return {entry['id']: entry for entry in data['entries']}
            # Assume it's already keyed by id
            return data
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # TypeError covers entries that are not dicts and a non-iterable
        # 'entries' value, so they get the same warning as a missing 'id'.
        print(f"Warning: Could not parse existing manifest: {e}", file=sys.stderr)
    return {}
def scan_component_files(site_dir: Path, repo_root: Path) -> List[Dict[str, Any]]:
    """
    Scan component JSON files and collect parts that point at GitHub files.

    Every ``*.json`` file directly inside ``site_dir`` is read in sorted
    order, so that the result (and the winner between duplicate part IDs
    downstream) does not depend on filesystem enumeration order.

    Args:
        site_dir: Directory holding the component JSON files.
        repo_root: Base directory; each JSON file path is stored relative to
            it when possible, otherwise as given.

    Returns:
        One dict per part with a parseable GitHub URL, with the keys
        'manifest_id', 'part_id', 'part', 'json_file', 'github_info', and
        'local_path'. Parts without a 'url' are skipped silently; parts whose
        URL is not a recognised GitHub URL are skipped with a warning.
        Unreadable or invalid JSON files are skipped with a warning. An empty
        list is returned (after an error message) when ``site_dir`` does not
        exist.
    """
    entries: List[Dict[str, Any]] = []

    if not site_dir.exists():
        print(f"Error: Site directory does not exist: {site_dir}", file=sys.stderr)
        return entries

    for json_file in sorted(site_dir.glob('*.json')):
        # Only the read/parse step is guarded, so errors from later
        # processing are not mislabelled as read failures.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError, IOError) as e:
            print(f"Warning: Could not read {json_file}: {e}", file=sys.stderr)
            continue

        # Store relative path from repo root; computed once per file.
        try:
            json_file_rel = json_file.relative_to(repo_root)
        except ValueError:
            # Not located under repo_root: keep the path as given.
            json_file_rel = json_file

        for item in find_printed_parts(data, str(json_file)):
            part = item['part']
            url = part.get('url')
            if not url:
                continue

            github_info = parse_github_url(url)
            if not github_info:
                print(f"Warning: Skipping non-GitHub URL in {json_file}: {url}", file=sys.stderr)
                continue

            part_id = item['part_id']
            entries.append({
                'manifest_id': generate_manifest_id(
                    part_id,
                    github_info['owner'],
                    github_info['repo'],
                    github_info['path'],
                ),
                'part_id': part_id,
                'part': part,
                'json_file': str(json_file_rel),
                'github_info': github_info,
                'local_path': generate_local_path(
                    github_info['owner'],
                    github_info['repo'],
                    github_info['path'],
                ),
            })

    return entries
def create_or_update_manifest_entry(
    existing_entry: Optional[Dict],
    new_data: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Build the manifest entry for one scanned part.

    When ``existing_entry`` is truthy, a shallow copy of it is returned with
    the source and origin fields refreshed from ``new_data``; all other
    fields (pinned SHA, checksum, status, ...) are carried over untouched.
    Otherwise a fresh entry is returned with every pin/check field set to
    None and 'status' set to 'unknown'. The ref defaults to 'main' when
    ``new_data['github_info']`` has no 'ref'.
    """
    source = new_data['github_info']
    entry_id = new_data['manifest_id']
    repo_slug = f"{source['owner']}/{source['repo']}"
    ref = source.get('ref', 'main')

    if not existing_entry:
        # Key order is the order written to the manifest file.
        return dict(
            id=entry_id,
            source_repo=repo_slug,
            source_path=source['path'],
            source_ref=ref,
            pinned_sha=None,
            pinned_raw_url=None,
            local_path=new_data['local_path'],
            checksum_sha256=None,
            last_checked=None,
            upstream_latest_sha=None,
            status='unknown',
            license=None,
            orig_site_json=new_data['json_file'],
            orig_item_id=new_data['part_id'],
        )

    # Overlay the refreshed fields on a copy; keys already present keep
    # their position, new ones are appended in the order listed here.
    refreshed = {
        'source_repo': repo_slug,
        'source_path': source['path'],
        'source_ref': ref,
        'local_path': new_data['local_path'],
        'orig_site_json': new_data['json_file'],
        'orig_item_id': new_data['part_id'],
    }
    return {**existing_entry, **refreshed}
def main():
    """
    Command-line entry point: build or refresh the vendor manifest.

    Relative ``--site-dir`` and ``--manifest`` values are resolved against
    the parent of the directory containing this script; absolute values are
    used as given. Existing manifest entries are kept and merged with the
    parts found in the component files, and the result is written as a JSON
    list sorted by entry 'id'.

    Exits with status 1 when no GitHub URLs are found in the component files.
    """
    parser = argparse.ArgumentParser(
        description='Generate vendor manifest from site component JSON files'
    )
    parser.add_argument(
        '--site-dir',
        type=Path,
        default=Path('website/src/data/components'),
        help='Directory containing component JSON files (default: website/src/data/components)'
    )
    parser.add_argument(
        '--manifest',
        type=Path,
        default=Path('manifest/vendor_manifest.json'),
        help='Path to manifest file (default: manifest/vendor_manifest.json)'
    )
    args = parser.parse_args()

    # Base directory is the parent of this script's directory. Resolve it up
    # front: the scanned JSON paths are resolved too, and relative_to() only
    # succeeds when both sides are in the same (absolute, symlink-free) form.
    # Otherwise machine-specific absolute paths end up in the manifest.
    repo_root = Path(__file__).resolve().parent.parent
    site_dir = (repo_root / args.site_dir).resolve()
    manifest_path = (repo_root / args.manifest).resolve()

    # Ensure manifest directory exists
    manifest_path.parent.mkdir(parents=True, exist_ok=True)

    # Load existing manifest
    existing_manifest = load_existing_manifest(manifest_path)

    # Scan component files
    print(f"Scanning component files in {site_dir}...")
    entries = scan_component_files(site_dir, repo_root=repo_root)
    if not entries:
        print("No GitHub URLs found in component files.", file=sys.stderr)
        sys.exit(1)

    # Create or update manifest entries
    updated_manifest = existing_manifest.copy()
    refreshed_ids = set()  # pre-existing IDs that this run actually touched
    for entry_data in entries:
        manifest_id = entry_data['manifest_id']
        if manifest_id in existing_manifest:
            refreshed_ids.add(manifest_id)
        existing_entry = updated_manifest.get(manifest_id)
        updated_manifest[manifest_id] = create_or_update_manifest_entry(
            existing_entry, entry_data
        )

    # Convert to sorted list for deterministic output
    manifest_list = sorted(updated_manifest.values(), key=lambda x: x['id'])

    # Write manifest
    print(f"Writing manifest to {manifest_path}...")
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest_list, f, indent=2, sort_keys=False)
    print(f"Generated {len(manifest_list)} manifest entries.")

    # Show summary
    new_entries = len(manifest_list) - len(existing_manifest)
    if new_entries > 0:
        print(f"Added {new_entries} new entries.")
    if refreshed_ids:
        # Count only entries matched by a scanned part, not every entry that
        # happened to be in the old manifest.
        print(f"Updated {len(refreshed_ids)} existing entries.")


if __name__ == '__main__':
    main()