Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Language Detector

This module detects the programming languages used in a repository.
""" | |
import os | |
import logging | |
from collections import Counter | |
# Module-level logger, named after this module so log output can be filtered per module.
logger = logging.getLogger(__name__)
# File extension to language mapping.
# Keys are lower-case extensions including the leading dot, exactly as
# returned by os.path.splitext(); values are display names of languages.
# Several extensions may map to the same language (e.g. '.yaml' and '.yml').
EXTENSION_TO_LANGUAGE = {
    '.py': 'Python',
    '.js': 'JavaScript',
    '.jsx': 'JavaScript',
    '.ts': 'TypeScript',
    '.tsx': 'TypeScript',
    '.java': 'Java',
    '.go': 'Go',
    '.rs': 'Rust',
    '.cpp': 'C++',
    '.cc': 'C++',
    '.cxx': 'C++',
    '.c': 'C',
    # NOTE: '.h' headers are attributed to C even though C++ projects use them too.
    '.h': 'C',
    '.hpp': 'C++',
    '.cs': 'C#',
    '.php': 'PHP',
    '.rb': 'Ruby',
    '.swift': 'Swift',
    '.kt': 'Kotlin',
    '.scala': 'Scala',
    '.r': 'R',
    '.sh': 'Shell',
    '.bash': 'Shell',
    '.zsh': 'Shell',
    '.html': 'HTML',
    '.htm': 'HTML',
    '.css': 'CSS',
    '.scss': 'SCSS',
    # NOTE: indented-syntax Sass files are grouped under 'SCSS'.
    '.sass': 'SCSS',
    '.less': 'Less',
    '.md': 'Markdown',
    '.json': 'JSON',
    '.xml': 'XML',
    '.yaml': 'YAML',
    '.yml': 'YAML',
    '.sql': 'SQL',
    '.graphql': 'GraphQL',
    '.gql': 'GraphQL',
}
# Special files to language mapping.
# Keys are exact, case-sensitive file names (no directory part) whose mere
# presence indicates a language or tool, regardless of their extension.
SPECIAL_FILES_TO_LANGUAGE = {
    'Dockerfile': 'Docker',
    'docker-compose.yml': 'Docker',
    'docker-compose.yaml': 'Docker',
    'Makefile': 'Make',
    'CMakeLists.txt': 'CMake',
    'package.json': 'JavaScript',
    'tsconfig.json': 'TypeScript',
    'requirements.txt': 'Python',
    'setup.py': 'Python',
    'pom.xml': 'Java',
    'build.gradle': 'Java',
    'Cargo.toml': 'Rust',
    'go.mod': 'Go',
}
class LanguageDetector:
    """
    Detects programming languages in a repository.

    A file is classified by its exact name first (SPECIAL_FILES_TO_LANGUAGE)
    and by its extension second (EXTENSION_TO_LANGUAGE). Hidden directories
    and common dependency/build directories are never descended into.
    """

    # Non-hidden directories pruned from the walk. Hidden directories (names
    # starting with '.', such as '.git') are pruned by a separate check.
    _SKIPPED_DIRECTORIES = frozenset(
        {'node_modules', 'venv', '__pycache__', 'dist', 'build'}
    )

    # Languages that detect_languages() may report. Anything else that the
    # mappings recognise (Markdown, JSON, Docker, ...) is filtered out there.
    _SUPPORTED_LANGUAGES = frozenset({
        "Python", "JavaScript", "TypeScript", "Java",
        "Go", "Rust", "C++", "C#", "PHP", "Ruby",
        "Swift", "Kotlin", "Scala", "R", "Shell",
    })

    def __init__(self):
        """
        Initialize the LanguageDetector.
        """
        logger.info("Initialized LanguageDetector")

    def _iter_files(self, repo_path):
        """
        Yield (file_name, file_path) for every file considered part of the repository.

        Directories and files are visited in sorted order so that results
        (in particular tie-breaking between equally frequent languages) do
        not depend on the arbitrary listing order of the file system.
        """
        for root, dirs, files in os.walk(repo_path):
            # Assigning to dirs[:] prunes the walk in place: os.walk will not
            # descend into anything removed here.
            dirs[:] = sorted(
                name for name in dirs
                if not name.startswith('.')
                and name not in self._SKIPPED_DIRECTORIES
            )
            for file_name in sorted(files):
                yield file_name, os.path.join(root, file_name)

    @staticmethod
    def _language_for(file_name):
        """
        Return the language for a bare file name, or None if it is not recognised.
        """
        # Special file names take precedence over the extension, so that
        # e.g. 'package.json' counts as JavaScript rather than JSON.
        language = SPECIAL_FILES_TO_LANGUAGE.get(file_name)
        if language is not None:
            return language

        _, ext = os.path.splitext(file_name)
        language = EXTENSION_TO_LANGUAGE.get(ext)
        if language is None:
            # Fall back to a case-insensitive match so that files such as
            # 'analysis.R' or 'MAIN.PY' are not silently ignored.
            language = EXTENSION_TO_LANGUAGE.get(ext.lower())
        return language

    def detect_languages(self, repo_path):
        """
        Detect programming languages in a repository.

        Each recognised file counts once towards its language; only
        languages in the supported set are reported.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            list: A list of detected programming languages, sorted by prevalence
                (number of files, most frequent first). Empty if nothing is
                recognised or the path cannot be walked.
        """
        logger.info("Detecting languages in repository: %s", repo_path)

        language_counter = Counter()
        for file_name, _ in self._iter_files(repo_path):
            language = self._language_for(file_name)
            if language is not None:
                language_counter[language] += 1

        # Get the top languages (limit to supported languages)
        detected_languages = [
            language for language, _ in language_counter.most_common()
            if language in self._SUPPORTED_LANGUAGES
        ]

        logger.info("Detected languages: %s", detected_languages)
        return detected_languages

    def get_language_breakdown(self, repo_path):
        """
        Get a breakdown of programming languages in a repository by lines of code.

        Every physical line is counted (blank lines and comments included).
        Unlike detect_languages(), the result is not restricted to the
        supported-language set. Files that cannot be read are skipped with
        a warning.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            dict: A dictionary mapping languages to lines of code.
        """
        logger.info("Getting language breakdown for repository: %s", repo_path)

        language_loc = {}
        for file_name, file_path in self._iter_files(repo_path):
            language = self._language_for(file_name)
            if language is None:
                continue

            try:
                # errors='ignore' keeps undecodable bytes from aborting the
                # count; only the number of lines matters here.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    line_count = sum(1 for _ in f)
            except OSError as e:
                logger.warning("Error counting lines in %s: %s", file_path, e)
                continue

            language_loc[language] = language_loc.get(language, 0) + line_count

        logger.info("Language breakdown: %s", language_loc)
        return language_loc