# 03_fetch_github.py
# Fetch repositories containing Dockerfiles from GitHub - v4
# Usage: python scripts/03_fetch_github.py --queries -1 --limit 500 --min_stars 3 --refresh --include_popular
import argparse
import json
import subprocess
import time
from datetime import datetime
from pathlib import Path

# === Topic and language configuration ===
LANGUAGES = [
    "python", "node", "go", "java", "rust", "php", "ruby", "typescript",
    "csharp", "scala", "kotlin", "perl", "elixir", "swift"
]

TOPICS = [
    "backend", "frontend", "production", "testing", "ci", "ml", "devops",
    "containers", "docker", "cloud", "microservices"
]

GENERAL = [
    "dockerfile", "docker container", "docker base image",
    "multi stage dockerfile", "dockerfile slim", "dockerfile devcontainer",
    "dockerfile ubuntu", "dockerfile alpine", "dockerfile debian"
]

DEFAULT_QUERIES = [
    "dockerfile python", "dockerfile node", "dockerfile typescript",
    "dockerfile javascript", "dockerfile golang", "dockerfile rust",
    "dockerfile java", "dockerfile kotlin", "dockerfile scala",
    "dockerfile php", "dockerfile ruby", "dockerfile csharp",
    "dockerfile dotnet", "dockerfile flask", "dockerfile django",
    "dockerfile fastapi", "dockerfile express", "dockerfile springboot",
    "dockerfile react", "dockerfile nextjs", "dockerfile vue",
    "dockerfile nuxt", "dockerfile svelte", "dockerfile laravel",
    "dockerfile symfony", "dockerfile postgres", "dockerfile mysql",
    "dockerfile mongodb", "dockerfile redis", "dockerfile nginx",
    "dockerfile apache", "dockerfile api", "dockerfile backend",
    "dockerfile frontend", "dockerfile microservices", "dockerfile monorepo",
    "dockerfile tensorflow", "dockerfile pytorch", "dockerfile huggingface",
    "dockerfile kubernetes", "dockerfile helm", "dockerfile gitlab",
    "dockerfile cicd", "dockerfile openshift", "dockerfile airflow",
    "dockerfile spark", "dockerfile jupyter", "dockerfile anaconda",
    "dockerfile dockerhub", "dockerfile datascience", "dockerfile databricks",
    "dockerfile github-actions", "dockerfile codequality"
]

SPECIAL_QUERIES = [
    "dockerfile base image", "dockerfile ci", "dockerfile cicd",
    "dockerfile templates", "dockerfile registry", "dockerfile minimal",
    "dockerfile multi-stage", "dockerfile builder",
    "dockerfile github workflow", "dockerfile production ready",
    "dockerfile examples", "dockerfile secure", "dockerfile dotnet",
    "dockerfile rust", "dockerfile slim image", "dockerfile cloud native",
    "dockerfile init", "dockerfile test image"
]

DEFAULT_OUTPUT_RAW = Path("data/metadata/repos_raw.json")
DEFAULT_OUTPUT_FILTERED = Path("data/metadata/repos_filtered.json")
DEFAULT_POPULAR_REPOS = Path("data/metadata/manual_popular_repos.json")


def generate_queries():
    """Build the full, deduplicated, sorted list of GitHub search queries.

    Combines the static query lists with every "dockerfile {lang} {topic}"
    cross product. A set guarantees no duplicates (e.g. "dockerfile cicd"
    appears in both DEFAULT_QUERIES and SPECIAL_QUERIES).
    """
    queries = set()
    queries.update(GENERAL)
    queries.update(DEFAULT_QUERIES)
    queries.update(SPECIAL_QUERIES)
    for lang in LANGUAGES:
        for topic in TOPICS:
            queries.add(f"dockerfile {lang} {topic}")
    return sorted(queries)


def run_query(query, limit):
    """Run one `gh search repos` query and return the parsed result list.

    Returns [] on any failure (non-zero exit, empty output, invalid JSON)
    so the caller's loop can continue with the remaining queries.
    """
    print(f"🔍 Szukam: {query}")
    # List-form argv (shell=False): the query string is passed verbatim,
    # never interpreted by a shell.
    result = subprocess.run(
        [
            "gh", "search", "repos", query,
            "--limit", str(limit),
            "--json",
            "fullName,description,stargazersCount,updatedAt,createdAt,pushedAt,url",
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"❌ Błąd zapytania: {result.stderr.strip()}")
        return []
    # `gh` can exit 0 with empty stdout; don't treat that as a JSON error.
    if not result.stdout.strip():
        print(f"⚠️ Brak wyników dla: {query}")
        return []
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"❌ Błąd JSON: {e}")
        return []
    if not data:
        print(f"⚠️ Brak wyników dla: {query}")
    return data


def deduplicate_and_filter(repos, min_stars, min_date):
    """Drop duplicate repos (by fullName) and apply star/recency filters.

    Args:
        repos: list of repo dicts as returned by `gh search repos --json`.
        min_stars: minimum stargazersCount (inclusive).
        min_date: naive datetime; repos updated strictly before it are dropped.

    Returns:
        Filtered list, preserving first-seen order.
    """
    seen = set()
    filtered = []
    for r in repos:
        name = r["fullName"]
        # Cheap checks first; only parse the date for repos still in play.
        if name in seen:
            continue
        if r["stargazersCount"] < min_stars:
            continue
        # updatedAt is ISO 8601 (e.g. "2024-01-01T00:00:00Z"); compare on the
        # date part only.
        updated = datetime.strptime(r["updatedAt"][:10], "%Y-%m-%d")
        if updated < min_date:
            continue
        seen.add(name)
        filtered.append(r)
    return filtered


def load_manual_popular_repos(path):
    """Load the hand-curated "popular repos" file and normalize its entries.

    Missing optional fields get defaults (notably a high star count of 9999
    and recent dates) so manual entries always survive the later filter step.
    Returns [] if the file is absent or unreadable.
    """
    if not path.exists():
        print(f"⚠️ Brak pliku: {path}")
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return [
            {
                "fullName": r["fullName"],
                "url": r.get("url", ""),
                "description": r.get("description", ""),
                "stargazersCount": r.get("stargazersCount", 9999),
                "updatedAt": r.get("updatedAt", "2024-01-01T00:00:00Z"),
                "createdAt": r.get("createdAt", "2020-01-01T00:00:00Z"),
                "pushedAt": r.get("pushedAt", "2024-01-01T00:00:00Z"),
            }
            for r in data
        ]
    except Exception as e:
        print(f"❌ Błąd wczytywania popularnych repozytoriów: {e}")
        return []


def main():
    """CLI entry point: run all queries, merge, dedupe/filter, write JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_output", type=Path, default=DEFAULT_OUTPUT_RAW)
    parser.add_argument("--filtered_output", type=Path, default=DEFAULT_OUTPUT_FILTERED)
    parser.add_argument("--queries", type=int, default=-1)  # -1 = all queries
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--min_stars", type=int, default=5)
    parser.add_argument("--min_date", type=str, default="2021-01-01")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--include_popular", action="store_true")
    parser.add_argument("--popular_file", type=Path, default=DEFAULT_POPULAR_REPOS)
    args = parser.parse_args()

    args.raw_output.parent.mkdir(parents=True, exist_ok=True)
    args.filtered_output.parent.mkdir(parents=True, exist_ok=True)
    min_date = datetime.strptime(args.min_date, "%Y-%m-%d")

    # Don't silently clobber an earlier crawl.
    if args.raw_output.exists() and not args.refresh:
        print(f"ℹ️ Plik {args.raw_output} już istnieje. Użyj --refresh, aby nadpisać.")
        return

    all_queries = generate_queries()
    queries = all_queries if args.queries == -1 else all_queries[:args.queries]
    print(f"🧠 Wygenerowano {len(queries)} zapytań:")
    for q in queries:
        print(" •", q)

    all_results = []
    for idx, query in enumerate(queries, 1):
        print(f"\n🔄 [{idx}/{len(queries)}]")
        results = run_query(query, args.limit)
        all_results.extend(results)
        # Throttle to stay under the GitHub search rate limit;
        # no need to sleep after the last query.
        if idx < len(queries):
            time.sleep(5)

    if args.include_popular:
        print(f"\n📌 Dodaję popularne repozytoria z pliku: {args.popular_file}")
        all_results.extend(load_manual_popular_repos(args.popular_file))

    print(f"\n📈 Łącznie zapytań: {len(queries)}")
    print(f"📦 Surowych wyników: {len(all_results)}")
    with open(args.raw_output, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII descriptions readable on disk.
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    clean_repos = deduplicate_and_filter(all_results, args.min_stars, min_date)
    with open(args.filtered_output, "w", encoding="utf-8") as f:
        json.dump(clean_repos, f, indent=2, ensure_ascii=False)

    print(f"✅ Po filtracji: {len(clean_repos)} repozytoriów")
    print(f"📁 Zapisano do: {args.filtered_output}")


if __name__ == "__main__":
    main()