# Source: binary-dockerfile-model / scripts / 03_fetch_github.py
# (Hugging Face listing header: uploaded by LeeSek, commit e9b8340, verified)
# 03_fetch_github.py
# Fetch repositories containing Dockerfiles from GitHub – v4
# Usage: python scripts/03_fetch_github.py --queries -1 --limit 500 --min_stars 3 --refresh --include_popular
import argparse
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime
# === Topic and language configuration ===
# LANGUAGES and TOPICS are cross-combined into "dockerfile <lang> <topic>"
# search queries by generate_queries().
LANGUAGES = [
    "python", "node", "go", "java", "rust", "php",
    "ruby", "typescript", "csharp", "scala", "kotlin", "perl", "elixir", "swift"
]
TOPICS = [
    "backend", "frontend", "production", "testing", "ci",
    "ml", "devops", "containers", "docker", "cloud", "microservices"
]
# Broad, language-agnostic Dockerfile searches.
GENERAL = [
    "dockerfile", "docker container", "docker base image",
    "multi stage dockerfile", "dockerfile slim", "dockerfile devcontainer",
    "dockerfile ubuntu", "dockerfile alpine", "dockerfile debian"
]
# Curated framework/tool-specific queries, always included.
DEFAULT_QUERIES = [
    "dockerfile python", "dockerfile node", "dockerfile typescript", "dockerfile javascript",
    "dockerfile golang", "dockerfile rust", "dockerfile java", "dockerfile kotlin", "dockerfile scala",
    "dockerfile php", "dockerfile ruby", "dockerfile csharp", "dockerfile dotnet", "dockerfile flask",
    "dockerfile django", "dockerfile fastapi", "dockerfile express", "dockerfile springboot",
    "dockerfile react", "dockerfile nextjs", "dockerfile vue", "dockerfile nuxt", "dockerfile svelte",
    "dockerfile laravel", "dockerfile symfony", "dockerfile postgres", "dockerfile mysql",
    "dockerfile mongodb", "dockerfile redis", "dockerfile nginx", "dockerfile apache",
    "dockerfile api", "dockerfile backend", "dockerfile frontend", "dockerfile microservices",
    "dockerfile monorepo", "dockerfile tensorflow", "dockerfile pytorch", "dockerfile huggingface",
    "dockerfile kubernetes", "dockerfile helm", "dockerfile gitlab", "dockerfile cicd",
    "dockerfile openshift", "dockerfile airflow", "dockerfile spark", "dockerfile jupyter",
    "dockerfile anaconda", "dockerfile dockerhub", "dockerfile datascience",
    "dockerfile databricks", "dockerfile github-actions", "dockerfile codequality"
]
# Extra niche queries (CI images, multi-stage builds, registries, templates).
SPECIAL_QUERIES = [
    "dockerfile base image", "dockerfile ci", "dockerfile cicd",
    "dockerfile templates", "dockerfile registry", "dockerfile minimal",
    "dockerfile multi-stage", "dockerfile builder", "dockerfile github workflow",
    "dockerfile production ready", "dockerfile examples", "dockerfile secure",
    "dockerfile dotnet", "dockerfile rust", "dockerfile slim image",
    "dockerfile cloud native", "dockerfile init", "dockerfile test image"
]
# Default output locations for raw and filtered repository metadata.
DEFAULT_OUTPUT_RAW = Path("data/metadata/repos_raw.json")
DEFAULT_OUTPUT_FILTERED = Path("data/metadata/repos_filtered.json")
# Hand-maintained list of popular repos, merged in with --include_popular.
DEFAULT_POPULAR_REPOS = Path("data/metadata/manual_popular_repos.json")
def generate_queries():
    """Build the full, sorted, de-duplicated list of GitHub search queries.

    Combines the three static query lists with the cross-product of
    LANGUAGES x TOPICS rendered as "dockerfile <lang> <topic>".
    """
    combos = {
        f"dockerfile {lang} {topic}"
        for lang in LANGUAGES
        for topic in TOPICS
    }
    return sorted(set(GENERAL) | set(DEFAULT_QUERIES) | set(SPECIAL_QUERIES) | combos)
def run_query(query, limit):
    """Search GitHub repositories with the `gh` CLI.

    Args:
        query: Free-text search string passed to `gh search repos`.
        limit: Maximum number of results to request from the API.

    Returns:
        A list of repo-metadata dicts (empty on any error or no hits).
    """
    print(f"🔍 Szukam: {query}")
    try:
        result = subprocess.run(
            [
                "gh", "search", "repos", query,
                "--limit", str(limit),
                "--json", "fullName,description,stargazersCount,updatedAt,createdAt,pushedAt,url",
            ],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # `gh` is not installed / not on PATH — previously this crashed the
        # whole run; treat it like a failed query instead.
        print("❌ Błąd zapytania: nie znaleziono polecenia 'gh'")
        return []
    if result.returncode != 0:
        print(f"❌ Błąd zapytania: {result.stderr.strip()}")
        return []
    try:
        # `gh --json` emits a JSON array on stdout.
        data = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        # Narrowed from `except Exception`: only the parse can fail here.
        print(f"❌ Błąd JSON: {e}")
        return []
    if not data:
        print(f"⚠️ Brak wyników dla: {query}")
    return data
def deduplicate_and_filter(repos, min_stars, min_date):
    """Drop duplicate repos and those below the star/recency thresholds.

    Args:
        repos: Iterable of repo dicts with "fullName", "stargazersCount"
            and "updatedAt" (ISO timestamp) keys.
        min_stars: Minimum stargazer count (inclusive).
        min_date: Oldest acceptable last-update datetime (inclusive).

    Returns:
        Filtered list in original order. A name is only marked as seen once
        an entry passes all filters, so a later duplicate of a rejected
        entry can still be accepted.
    """
    accepted_names = set()
    kept = []
    for repo in repos:
        full_name = repo["fullName"]
        # Only the date part of the ISO timestamp matters for the cutoff.
        last_update = datetime.strptime(repo["updatedAt"][:10], "%Y-%m-%d")
        if (full_name in accepted_names
                or repo["stargazersCount"] < min_stars
                or last_update < min_date):
            continue
        accepted_names.add(full_name)
        kept.append(repo)
    return kept
def load_manual_popular_repos(path):
    """Load a hand-curated list of popular repos and normalize each entry.

    Missing optional fields get permissive defaults (high star count,
    recent dates) so these entries survive the downstream filters.

    Args:
        path: Path to a JSON file holding a list of objects, each with at
            least a "fullName" key.

    Returns:
        A list of normalized repo dicts; empty if the file is missing or
        cannot be parsed.
    """
    if not path.exists():
        print(f"⚠️ Brak pliku: {path}")
        return []
    try:
        # Explicit encoding: the file is hand-written and may contain
        # non-ASCII descriptions; don't depend on the platform default.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return [
            {
                "fullName": r["fullName"],
                "url": r.get("url", ""),
                "description": r.get("description", ""),
                "stargazersCount": r.get("stargazersCount", 9999),
                "updatedAt": r.get("updatedAt", "2024-01-01T00:00:00Z"),
                "createdAt": r.get("createdAt", "2020-01-01T00:00:00Z"),
                "pushedAt": r.get("pushedAt", "2024-01-01T00:00:00Z"),
            }
            for r in data
        ]
    except Exception as e:
        # Deliberately broad: a malformed file or an entry without
        # "fullName" must not abort the whole fetch run (best-effort).
        print(f"❌ Błąd wczytywania popularnych repozytoriów: {e}")
        return []
def main():
    """CLI entry point: run all search queries, then write raw and filtered JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_output", type=Path, default=DEFAULT_OUTPUT_RAW)
    parser.add_argument("--filtered_output", type=Path, default=DEFAULT_OUTPUT_FILTERED)
    parser.add_argument("--queries", type=int, default=-1)
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--min_stars", type=int, default=5)
    parser.add_argument("--min_date", type=str, default="2021-01-01")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--include_popular", action="store_true")
    parser.add_argument("--popular_file", type=Path, default=DEFAULT_POPULAR_REPOS)
    args = parser.parse_args()

    # Ensure both output directories exist before doing any work.
    for target in (args.raw_output, args.filtered_output):
        target.parent.mkdir(parents=True, exist_ok=True)

    cutoff = datetime.strptime(args.min_date, "%Y-%m-%d")

    # Don't clobber an existing raw dump unless the user asked for it.
    if args.raw_output.exists() and not args.refresh:
        print(f"ℹ️ Plik {args.raw_output} już istnieje. Użyj --refresh, aby nadpisać.")
        return

    available = generate_queries()
    # --queries -1 means "run everything"; otherwise take a prefix.
    selected = available if args.queries == -1 else available[:args.queries]
    print(f"🧠 Wygenerowano {len(selected)} zapytań:")
    for query in selected:
        print(" •", query)

    collected = []
    for position, query in enumerate(selected, 1):
        print(f"\n🔄 [{position}/{len(selected)}]")
        collected.extend(run_query(query, args.limit))
        time.sleep(5)  # pause between searches to respect the API rate limit

    if args.include_popular:
        print(f"\n📌 Dodaję popularne repozytoria z pliku: {args.popular_file}")
        collected.extend(load_manual_popular_repos(args.popular_file))

    print(f"\n📈 Łącznie zapytań: {len(selected)}")
    print(f"📦 Surowych wyników: {len(collected)}")
    with open(args.raw_output, "w") as raw_file:
        json.dump(collected, raw_file, indent=2)

    survivors = deduplicate_and_filter(collected, args.min_stars, cutoff)
    with open(args.filtered_output, "w") as filtered_file:
        json.dump(survivors, filtered_file, indent=2)
    print(f"✅ Po filtracji: {len(survivors)} repozytoriów")
    print(f"📁 Zapisano do: {args.filtered_output}")
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()