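"""Collect GitHub repository metadata for building a Dockerfile dataset.

Generates a pool of GitHub search queries (general keywords, curated defaults,
and every language/topic combination), runs them through the `gh` CLI, and
optionally merges a manually curated list of popular repositories. Both the
raw results and the deduplicated, star- and date-filtered results are written
to JSON files.

Typical invocation (script name assumed):
    python collect_repos.py --refresh --include_popular --limit 50
"""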

import argparse
import json
import subprocess
import time
from datetime import datetime
from pathlib import Path

LANGUAGES = [
    "python", "node", "go", "java", "rust", "php",
    "ruby", "typescript", "csharp", "scala", "kotlin", "perl", "elixir", "swift"
]
TOPICS = [
    "backend", "frontend", "production", "testing", "ci",
    "ml", "devops", "containers", "docker", "cloud", "microservices"
]
GENERAL = [
    "dockerfile", "docker container", "docker base image",
    "multi stage dockerfile", "dockerfile slim", "dockerfile devcontainer",
    "dockerfile ubuntu", "dockerfile alpine", "dockerfile debian"
]
DEFAULT_QUERIES = [
    "dockerfile python", "dockerfile node", "dockerfile typescript", "dockerfile javascript",
    "dockerfile golang", "dockerfile rust", "dockerfile java", "dockerfile kotlin", "dockerfile scala",
    "dockerfile php", "dockerfile ruby", "dockerfile csharp", "dockerfile dotnet", "dockerfile flask",
    "dockerfile django", "dockerfile fastapi", "dockerfile express", "dockerfile springboot",
    "dockerfile react", "dockerfile nextjs", "dockerfile vue", "dockerfile nuxt", "dockerfile svelte",
    "dockerfile laravel", "dockerfile symfony", "dockerfile postgres", "dockerfile mysql",
    "dockerfile mongodb", "dockerfile redis", "dockerfile nginx", "dockerfile apache",
    "dockerfile api", "dockerfile backend", "dockerfile frontend", "dockerfile microservices",
    "dockerfile monorepo", "dockerfile tensorflow", "dockerfile pytorch", "dockerfile huggingface",
    "dockerfile kubernetes", "dockerfile helm", "dockerfile gitlab", "dockerfile cicd",
    "dockerfile openshift", "dockerfile airflow", "dockerfile spark", "dockerfile jupyter",
    "dockerfile anaconda", "dockerfile dockerhub", "dockerfile datascience",
    "dockerfile databricks", "dockerfile github-actions", "dockerfile codequality"
]
SPECIAL_QUERIES = [
    "dockerfile base image", "dockerfile ci", "dockerfile cicd",
    "dockerfile templates", "dockerfile registry", "dockerfile minimal",
    "dockerfile multi-stage", "dockerfile builder", "dockerfile github workflow",
    "dockerfile production ready", "dockerfile examples", "dockerfile secure",
    "dockerfile dotnet", "dockerfile rust", "dockerfile slim image",
    "dockerfile cloud native", "dockerfile init", "dockerfile test image"
]

DEFAULT_OUTPUT_RAW = Path("data/metadata/repos_raw.json")
DEFAULT_OUTPUT_FILTERED = Path("data/metadata/repos_filtered.json")
DEFAULT_POPULAR_REPOS = Path("data/metadata/manual_popular_repos.json")


def generate_queries():
    """Combine the base query lists with every language/topic pair."""
    queries = set()
    queries.update(GENERAL)
    queries.update(DEFAULT_QUERIES)
    queries.update(SPECIAL_QUERIES)

    # Add one query per language/topic combination, e.g. "dockerfile python ci".
    for lang in LANGUAGES:
        for topic in TOPICS:
            queries.add(f"dockerfile {lang} {topic}")

    return sorted(queries)
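

# NOTE: `gh search repos` requires the GitHub CLI to be installed and
# authenticated (e.g. via `gh auth login`). The GitHub search API is rate
# limited, which is why main() sleeps between queries.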
def run_query(query, limit):
    """Run one `gh search repos` query and return the parsed JSON result list."""
    print(f"🔍 Searching: {query}")
    result = subprocess.run([
        "gh", "search", "repos", query,
        "--limit", str(limit),
        "--json", "fullName,description,stargazersCount,updatedAt,createdAt,pushedAt,url"
    ], capture_output=True, text=True)

    if result.returncode != 0:
        print(f"❌ Query error: {result.stderr.strip()}")
        return []

    try:
        data = json.loads(result.stdout)
        if not data:
            print(f"⚠️ No results for: {query}")
        return data
    except Exception as e:
        print(f"❌ JSON error: {e}")
        return []


def deduplicate_and_filter(repos, min_stars, min_date):
    """Drop duplicate repos and those below the star or recency thresholds."""
    seen = set()
    filtered = []
    for r in repos:
        name = r["fullName"]
        updated = datetime.strptime(r["updatedAt"][:10], "%Y-%m-%d")
        if name in seen:
            continue
        if r["stargazersCount"] < min_stars:
            continue
        if updated < min_date:
            continue
        seen.add(name)
        filtered.append(r)
    return filtered
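

# The manually curated list is expected to be a JSON array of objects that each
# provide at least "fullName"; missing metadata fields are filled with
# permissive defaults so curated entries survive the later star/date filtering.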
def load_manual_popular_repos(path):
    """Load the hand-curated repo list, filling in missing metadata fields."""
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return []

    with open(path, "r") as f:
        try:
            data = json.load(f)
            enriched = []
            for r in data:
                enriched.append({
                    "fullName": r["fullName"],
                    "url": r.get("url", ""),
                    "description": r.get("description", ""),
                    "stargazersCount": r.get("stargazersCount", 9999),
                    "updatedAt": r.get("updatedAt", "2024-01-01T00:00:00Z"),
                    "createdAt": r.get("createdAt", "2020-01-01T00:00:00Z"),
                    "pushedAt": r.get("pushedAt", "2024-01-01T00:00:00Z")
                })
            return enriched
        except Exception as e:
            print(f"❌ Failed to load popular repositories: {e}")
            return []


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_output", type=Path, default=DEFAULT_OUTPUT_RAW)
    parser.add_argument("--filtered_output", type=Path, default=DEFAULT_OUTPUT_FILTERED)
    parser.add_argument("--queries", type=int, default=-1)
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--min_stars", type=int, default=5)
    parser.add_argument("--min_date", type=str, default="2021-01-01")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--include_popular", action="store_true")
    parser.add_argument("--popular_file", type=Path, default=DEFAULT_POPULAR_REPOS)
    args = parser.parse_args()

    args.raw_output.parent.mkdir(parents=True, exist_ok=True)
    args.filtered_output.parent.mkdir(parents=True, exist_ok=True)
    min_date = datetime.strptime(args.min_date, "%Y-%m-%d")

    if args.raw_output.exists() and not args.refresh:
        print(f"ℹ️ File {args.raw_output} already exists. Use --refresh to overwrite.")
        return

    all_queries = generate_queries()
    queries = all_queries if args.queries == -1 else all_queries[:args.queries]

    print(f"🧠 Generated {len(queries)} queries:")
    for q in queries:
        print(" •", q)

    all_results = []
    for idx, query in enumerate(queries, 1):
        print(f"\n🔄 [{idx}/{len(queries)}]")
        results = run_query(query, args.limit)
        all_results.extend(results)
        time.sleep(5)  # stay well under the GitHub search rate limit

    if args.include_popular:
        print(f"\n📌 Adding popular repositories from file: {args.popular_file}")
        all_results.extend(load_manual_popular_repos(args.popular_file))

    print(f"\n📈 Total queries: {len(queries)}")
    print(f"📦 Raw results: {len(all_results)}")
    with open(args.raw_output, "w") as f:
        json.dump(all_results, f, indent=2)

    clean_repos = deduplicate_and_filter(all_results, args.min_stars, min_date)
    with open(args.filtered_output, "w") as f:
        json.dump(clean_repos, f, indent=2)

    print(f"✅ After filtering: {len(clean_repos)} repositories")
    print(f"📁 Saved to: {args.filtered_output}")


if __name__ == "__main__":
    main()