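"""Download the disisbig/hindi-wikipedia-articles-172k dataset from Kaggle,
build a DataFrame of articles, save it to data/articles.parquet, and split the
articles into train/valid text files under data/train and data/valid.
"""
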
import random
import re
from pathlib import Path
from typing import Dict, List

import kagglehub
import pandas as pd
from tqdm import tqdm


def count_hindi_words(text: str) -> int:
    """Count words in Hindi text."""
    words = text.strip().split()
    # A word counts as Hindi if it contains at least one Devanagari character (U+0900-U+097F).
    hindi_words = [w for w in words if re.search(r'[\u0900-\u097F]', w)]
    return len(hindi_words)
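
# Illustrative check: only tokens containing Devanagari characters are counted,
# so English words and digits are ignored, e.g.
#   count_hindi_words("नमस्ते दुनिया hello world") == 2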


def create_dataframe_from_files(downloaded_paths: List[str]) -> pd.DataFrame:
    """Create a DataFrame from downloaded text files."""
    print("\nCreating DataFrame from text files...")

    data = []
    for file_path in tqdm(downloaded_paths):
        if file_path.endswith('.txt'):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read().strip()

                # The first line is treated as the article title, the rest as the body.
                lines = content.split('\n', 1)
                title = lines[0].strip()
                text = lines[1].strip() if len(lines) > 1 else ""

                data.append({
                    'title': title,
                    'text': text,
                    'word_count': count_hindi_words(content)
                })
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")
                continue

    df = pd.DataFrame(data)
    print(f"Created DataFrame with {len(df)} articles")
    return df


def process_and_split_articles(df: pd.DataFrame,
                               output_dir: Path,
                               train_ratio: float = 0.8,
                               min_words: int = 100,
                               max_words: int = 5000) -> Dict[str, int]:
    """Process articles and split them into files based on word count."""
    train_dir = output_dir / "train"
    valid_dir = output_dir / "valid"
    train_dir.mkdir(parents=True, exist_ok=True)
    valid_dir.mkdir(parents=True, exist_ok=True)

    stats = {'train': 0, 'valid': 0, 'skipped': 0}

    print("\nProcessing articles...")
    for _, row in tqdm(df.iterrows(), total=len(df)):
        try:
            # Skip articles outside the allowed word-count range.
            if row['word_count'] < min_words or row['word_count'] > max_words:
                stats['skipped'] += 1
                continue

            full_text = f"{row['title']}\n\n{row['text']}"

            # Randomly assign the article to the train or validation split.
            is_train = random.random() < train_ratio
            target_dir = train_dir if is_train else valid_dir

            # Name files by word count, adding a numeric suffix to avoid collisions.
            file_path = target_dir / f"{row['word_count']}.txt"
            suffix = 1
            while file_path.exists():
                file_path = target_dir / f"{row['word_count']}_{suffix}.txt"
                suffix += 1

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(full_text)

            if is_train:
                stats['train'] += 1
            else:
                stats['valid'] += 1

        except Exception as e:
            print(f"Error processing article: {e}")
            stats['skipped'] += 1
            continue

    return stats


def download_hindi_wikipedia_dataset():
    """Download and process Hindi Wikipedia dataset."""
    print("Starting dataset download...")

    try:
        # kagglehub.dataset_download returns the local path of the downloaded dataset.
        dataset_path = kagglehub.dataset_download(
            "disisbig/hindi-wikipedia-articles-172k"
        )

        print("Dataset downloaded successfully!")
        print("Dataset path:", dataset_path)

        data_dir = Path("data")
        data_dir.mkdir(exist_ok=True)

        # Collect the article text files and build the DataFrame.
        downloaded_paths = [str(p) for p in Path(dataset_path).rglob("*.txt")]
        df = create_dataframe_from_files(downloaded_paths)

        # Save the full DataFrame for later reuse.
        df.to_parquet(data_dir / "articles.parquet")
        print(f"Saved DataFrame to {data_dir / 'articles.parquet'}")

        # Split the articles into train/valid text files.
        stats = process_and_split_articles(df, data_dir)

        print("\nProcessing completed:")
        print(f"Train files: {stats['train']}")
        print(f"Validation files: {stats['valid']}")
        print(f"Skipped articles: {stats['skipped']}")

        # Report the on-disk size of each split.
        train_size = sum(f.stat().st_size for f in (data_dir / "train").glob("*.txt"))
        valid_size = sum(f.stat().st_size for f in (data_dir / "valid").glob("*.txt"))

        print("\nTotal size:")
        print(f"Train: {train_size / (1024*1024):.2f} MB")
        print(f"Validation: {valid_size / (1024*1024):.2f} MB")

        return True

    except Exception as e:
        print(f"Error downloading/processing dataset: {e}")
        return False


def verify_dataset_structure():
    """Verify the dataset directory structure and files."""
    data_dir = Path("data")

    if not data_dir.exists():
        print("Error: Data directory not found!")
        return False

    parquet_file = data_dir / "articles.parquet"
    if parquet_file.exists():
        df = pd.read_parquet(parquet_file)
        print("\nArticles DataFrame:")
        print(f"Total articles: {len(df)}")

    for split in ['train', 'valid']:
        split_dir = data_dir / split
        if not split_dir.exists():
            print(f"Error: {split} directory not found!")
            return False

        txt_files = list(split_dir.glob("*.txt"))
        if not txt_files:
            print(f"Error: No text files found in {split} directory!")
            return False

        print(f"\n{split.upper()} split:")
        print(f"Number of files: {len(txt_files)}")
        word_counts = [int(f.stem.split('_')[0]) for f in txt_files]
        print(f"Word count range: {min(word_counts)} - {max(word_counts)}")

    return True


if __name__ == "__main__":
    success = download_hindi_wikipedia_dataset()

    if success:
        print("\nVerifying dataset structure...")
        verify_dataset_structure()
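
# To reuse the processed data later (illustrative):
#   df = pd.read_parquet("data/articles.parquet")
#   train_texts = [p.read_text(encoding="utf-8") for p in Path("data/train").glob("*.txt")]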