# Source: binary-dockerfile-model / scripts / 03_fetch_github.py
# (Hugging Face listing header: uploaded by LeeSek, commit e9b8340, verified)
# 03_fetch_github.py
# Fetch repositories containing Dockerfiles from GitHub – v4
# Usage: python scripts/03_fetch_github.py --queries -1 --limit 500 --min_stars 3 --refresh --include_popular
import argparse
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime
# === Topic and language configuration ===
# LANGUAGES and TOPICS are cross-combined into "dockerfile <lang> <topic>"
# search queries by generate_queries().
LANGUAGES = [
    "python", "node", "go", "java", "rust", "php",
    "ruby", "typescript", "csharp", "scala", "kotlin", "perl", "elixir", "swift"
]
TOPICS = [
    "backend", "frontend", "production", "testing", "ci",
    "ml", "devops", "containers", "docker", "cloud", "microservices"
]
# Broad, language-agnostic Dockerfile searches.
GENERAL = [
    "dockerfile", "docker container", "docker base image",
    "multi stage dockerfile", "dockerfile slim", "dockerfile devcontainer",
    "dockerfile ubuntu", "dockerfile alpine", "dockerfile debian"
]
# Curated framework/tool-specific queries, always included.
DEFAULT_QUERIES = [
    "dockerfile python", "dockerfile node", "dockerfile typescript", "dockerfile javascript",
    "dockerfile golang", "dockerfile rust", "dockerfile java", "dockerfile kotlin", "dockerfile scala",
    "dockerfile php", "dockerfile ruby", "dockerfile csharp", "dockerfile dotnet", "dockerfile flask",
    "dockerfile django", "dockerfile fastapi", "dockerfile express", "dockerfile springboot",
    "dockerfile react", "dockerfile nextjs", "dockerfile vue", "dockerfile nuxt", "dockerfile svelte",
    "dockerfile laravel", "dockerfile symfony", "dockerfile postgres", "dockerfile mysql",
    "dockerfile mongodb", "dockerfile redis", "dockerfile nginx", "dockerfile apache",
    "dockerfile api", "dockerfile backend", "dockerfile frontend", "dockerfile microservices",
    "dockerfile monorepo", "dockerfile tensorflow", "dockerfile pytorch", "dockerfile huggingface",
    "dockerfile kubernetes", "dockerfile helm", "dockerfile gitlab", "dockerfile cicd",
    "dockerfile openshift", "dockerfile airflow", "dockerfile spark", "dockerfile jupyter",
    "dockerfile anaconda", "dockerfile dockerhub", "dockerfile datascience",
    "dockerfile databricks", "dockerfile github-actions", "dockerfile codequality"
]
# Extra niche queries (CI images, multi-stage builds, registries, templates).
SPECIAL_QUERIES = [
    "dockerfile base image", "dockerfile ci", "dockerfile cicd",
    "dockerfile templates", "dockerfile registry", "dockerfile minimal",
    "dockerfile multi-stage", "dockerfile builder", "dockerfile github workflow",
    "dockerfile production ready", "dockerfile examples", "dockerfile secure",
    "dockerfile dotnet", "dockerfile rust", "dockerfile slim image",
    "dockerfile cloud native", "dockerfile init", "dockerfile test image"
]
# Default output locations for raw and filtered repository metadata.
DEFAULT_OUTPUT_RAW = Path("data/metadata/repos_raw.json")
DEFAULT_OUTPUT_FILTERED = Path("data/metadata/repos_filtered.json")
# Hand-maintained list of popular repos, merged in with --include_popular.
DEFAULT_POPULAR_REPOS = Path("data/metadata/manual_popular_repos.json")
def generate_queries():
    """Build the full, sorted, de-duplicated list of GitHub search queries.

    Combines the three static query lists with the cross-product of
    LANGUAGES x TOPICS rendered as "dockerfile <lang> <topic>".
    """
    combos = {
        f"dockerfile {lang} {topic}"
        for lang in LANGUAGES
        for topic in TOPICS
    }
    return sorted(set(GENERAL) | set(DEFAULT_QUERIES) | set(SPECIAL_QUERIES) | combos)
def run_query(query, limit):
    """Search GitHub repositories with the `gh` CLI.

    Args:
        query: Free-text search string passed to `gh search repos`.
        limit: Maximum number of results to request from the API.

    Returns:
        A list of repo-metadata dicts (empty on any error or no hits).
    """
    print(f"🔍 Szukam: {query}")
    try:
        result = subprocess.run(
            [
                "gh", "search", "repos", query,
                "--limit", str(limit),
                "--json", "fullName,description,stargazersCount,updatedAt,createdAt,pushedAt,url",
            ],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # `gh` is not installed / not on PATH — previously this crashed the
        # whole run; treat it like a failed query instead.
        print("❌ Błąd zapytania: nie znaleziono polecenia 'gh'")
        return []
    if result.returncode != 0:
        print(f"❌ Błąd zapytania: {result.stderr.strip()}")
        return []
    try:
        # `gh --json` emits a JSON array on stdout.
        data = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        # Narrowed from `except Exception`: only the parse can fail here.
        print(f"❌ Błąd JSON: {e}")
        return []
    if not data:
        print(f"⚠️ Brak wyników dla: {query}")
    return data
def deduplicate_and_filter(repos, min_stars, min_date):
    """Drop duplicate repos and those below the star/recency thresholds.

    Args:
        repos: Iterable of repo dicts with "fullName", "stargazersCount"
            and "updatedAt" (ISO timestamp) keys.
        min_stars: Minimum stargazer count (inclusive).
        min_date: Oldest acceptable last-update datetime (inclusive).

    Returns:
        Filtered list in original order. A name is only marked as seen once
        an entry passes all filters, so a later duplicate of a rejected
        entry can still be accepted.
    """
    accepted_names = set()
    kept = []
    for repo in repos:
        full_name = repo["fullName"]
        # Only the date part of the ISO timestamp matters for the cutoff.
        last_update = datetime.strptime(repo["updatedAt"][:10], "%Y-%m-%d")
        if (full_name in accepted_names
                or repo["stargazersCount"] < min_stars
                or last_update < min_date):
            continue
        accepted_names.add(full_name)
        kept.append(repo)
    return kept
def load_manual_popular_repos(path):
    """Load a hand-curated list of popular repos and normalize each entry.

    Missing optional fields get permissive defaults (high star count,
    recent dates) so these entries survive the downstream filters.

    Args:
        path: Path to a JSON file holding a list of objects, each with at
            least a "fullName" key.

    Returns:
        A list of normalized repo dicts; empty if the file is missing or
        cannot be parsed.
    """
    if not path.exists():
        print(f"⚠️ Brak pliku: {path}")
        return []
    try:
        # Explicit encoding: the file is hand-written and may contain
        # non-ASCII descriptions; don't depend on the platform default.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return [
            {
                "fullName": r["fullName"],
                "url": r.get("url", ""),
                "description": r.get("description", ""),
                "stargazersCount": r.get("stargazersCount", 9999),
                "updatedAt": r.get("updatedAt", "2024-01-01T00:00:00Z"),
                "createdAt": r.get("createdAt", "2020-01-01T00:00:00Z"),
                "pushedAt": r.get("pushedAt", "2024-01-01T00:00:00Z"),
            }
            for r in data
        ]
    except Exception as e:
        # Deliberately broad: a malformed file or an entry without
        # "fullName" must not abort the whole fetch run (best-effort).
        print(f"❌ Błąd wczytywania popularnych repozytoriów: {e}")
        return []
def main():
    """CLI entry point: run all search queries, then write raw and filtered JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_output", type=Path, default=DEFAULT_OUTPUT_RAW)
    parser.add_argument("--filtered_output", type=Path, default=DEFAULT_OUTPUT_FILTERED)
    parser.add_argument("--queries", type=int, default=-1)
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--min_stars", type=int, default=5)
    parser.add_argument("--min_date", type=str, default="2021-01-01")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--include_popular", action="store_true")
    parser.add_argument("--popular_file", type=Path, default=DEFAULT_POPULAR_REPOS)
    args = parser.parse_args()

    # Ensure both output directories exist before doing any work.
    for target in (args.raw_output, args.filtered_output):
        target.parent.mkdir(parents=True, exist_ok=True)

    cutoff = datetime.strptime(args.min_date, "%Y-%m-%d")

    # Don't clobber an existing raw dump unless the user asked for it.
    if args.raw_output.exists() and not args.refresh:
        print(f"ℹ️ Plik {args.raw_output} już istnieje. Użyj --refresh, aby nadpisać.")
        return

    available = generate_queries()
    # --queries -1 means "run everything"; otherwise take a prefix.
    selected = available if args.queries == -1 else available[:args.queries]
    print(f"🧠 Wygenerowano {len(selected)} zapytań:")
    for query in selected:
        print(" •", query)

    collected = []
    for position, query in enumerate(selected, 1):
        print(f"\n🔄 [{position}/{len(selected)}]")
        collected.extend(run_query(query, args.limit))
        time.sleep(5)  # pause between searches to respect the API rate limit

    if args.include_popular:
        print(f"\n📌 Dodaję popularne repozytoria z pliku: {args.popular_file}")
        collected.extend(load_manual_popular_repos(args.popular_file))

    print(f"\n📈 Łącznie zapytań: {len(selected)}")
    print(f"📦 Surowych wyników: {len(collected)}")
    with open(args.raw_output, "w") as raw_file:
        json.dump(collected, raw_file, indent=2)

    survivors = deduplicate_and_filter(collected, args.min_stars, cutoff)
    with open(args.filtered_output, "w") as filtered_file:
        json.dump(survivors, filtered_file, indent=2)
    print(f"✅ Po filtracji: {len(survivors)} repozytoriów")
    print(f"📁 Zapisano do: {args.filtered_output}")
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()