import os
import json
from typing import List, Optional

import pandas as pd
from langchain_core.documents import Document

# Note: `import kaggle` is deferred to download_dataset() because the kaggle
# package authenticates on import, which fails before credentials are set up.

class KaggleDataLoader:
    """Load and process Kaggle datasets for RAG."""

    def __init__(self, kaggle_username: Optional[str] = None, kaggle_key: Optional[str] = None):
        """
        Initialize the Kaggle loader.

        Args:
            kaggle_username: Your Kaggle username (optional if using kaggle.json)
            kaggle_key: Your Kaggle API key (optional if using kaggle.json)
        """
        self.kaggle_username = kaggle_username
        self.kaggle_key = kaggle_key

        # Try to load credentials from kaggle.json first; if found, they
        # override the constructor parameters.
        self._load_kaggle_credentials()

        # Export credentials so the kaggle package can pick them up.
        if self.kaggle_username and self.kaggle_key:
            os.environ['KAGGLE_USERNAME'] = self.kaggle_username
            os.environ['KAGGLE_KEY'] = self.kaggle_key
            print("Kaggle credentials loaded successfully")
        else:
            print("Warning: No Kaggle credentials found. Please set up kaggle.json or provide credentials.")

    def _load_kaggle_credentials(self):
        """Load Kaggle credentials from a kaggle.json file, if one exists."""
        # Common locations for kaggle.json. A single cwd entry suffices:
        # "./kaggle.json" and os.path.join(os.getcwd(), ...) are the same path.
        possible_paths = [
            os.path.expanduser("~/.kaggle/kaggle.json"),
            os.path.expanduser("~/kaggle.json"),
            os.path.join(os.getcwd(), "kaggle.json"),
        ]

        for path in possible_paths:
            if os.path.exists(path):
                try:
                    with open(path, 'r') as f:
                        credentials = json.load(f)
                    # Extract username and key from kaggle.json
                    if 'username' in credentials and 'key' in credentials:
                        self.kaggle_username = credentials['username']
                        self.kaggle_key = credentials['key']
                        print(f"Loaded Kaggle credentials from {path}")
                        return
                    print(f"Invalid kaggle.json format at {path}. Expected 'username' and 'key' fields.")
                except Exception as e:
                    print(f"Error reading kaggle.json from {path}: {e}")

        print("No valid kaggle.json found in common locations:")
        for path in possible_paths:
            print(f"  - {path}")
        print("Please create kaggle.json with your Kaggle API credentials.")

    def download_dataset(self, dataset_name: str, download_path: str = "./data") -> str:
        """
        Download a Kaggle dataset.

        Args:
            dataset_name: Dataset name in the format 'username/dataset-name'
            download_path: Directory to save the dataset under

        Returns:
            Path to the downloaded dataset
        """
        if not self.kaggle_username or not self.kaggle_key:
            raise ValueError("Kaggle credentials not found. Please set up kaggle.json or provide credentials.")

        try:
            # Imported here rather than at module level: the kaggle package
            # authenticates on import, which would fail before the credentials
            # are exported in __init__.
            import kaggle

            # Give each dataset its own directory, e.g. 'user/name' -> 'user_name'.
            dataset_dir = dataset_name.replace('/', '_')
            full_download_path = os.path.join(download_path, dataset_dir)
            os.makedirs(full_download_path, exist_ok=True)

            kaggle.api.authenticate()
            kaggle.api.dataset_download_files(dataset_name, path=full_download_path, unzip=True)
            print(f"Dataset {dataset_name} downloaded successfully to {full_download_path}")
            return full_download_path
        except Exception as e:
            print(f"Error downloading dataset: {e}")
            raise
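
    # The equivalent download with the Kaggle CLI, handy for sanity-checking
    # credentials outside this class:
    #
    #   kaggle datasets download -d username/dataset-name --unzip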

    def load_csv_dataset(self, file_path: str, text_columns: List[str]) -> List[Document]:
        """
        Load a CSV file and convert each row to a Document.

        Args:
            file_path: Path to the CSV file
            text_columns: Columns to combine into the document text (ignored
                for FAQ-style files with 'Questions'/'Answers' columns)

        Returns:
            List of Document objects
        """
        try:
            df = pd.read_csv(file_path)
            documents = []

            # For FAQ datasets, combine the question and answer columns.
            if 'Questions' in df.columns and 'Answers' in df.columns:
                print(f"Processing FAQ dataset with {len(df)} Q&A pairs")
                for idx, row in df.iterrows():
                    question = str(row['Questions']).strip()
                    answer = str(row['Answers']).strip()
                    # Feature the question prominently for better retrieval.
                    content = f"QUESTION: {question}\n\nANSWER: {answer}"
                    documents.append(Document(
                        page_content=content,
                        metadata={"source": file_path, "type": "faq", "question_id": idx, "question": question}
                    ))
            else:
                # Fall back to combining the requested columns for other CSV files.
                print(f"Processing regular CSV with columns: {text_columns}")
                for idx, row in df.iterrows():
                    text_parts = [
                        str(row[col]).strip()
                        for col in text_columns
                        if col in df.columns and pd.notna(row[col])
                    ]
                    if text_parts:
                        documents.append(Document(
                            page_content=" ".join(text_parts),
                            metadata={"source": file_path, "row": idx}
                        ))

            print(f"Created {len(documents)} documents from CSV")
            return documents
        except Exception as e:
            print(f"Error loading CSV dataset: {e}")
            return []
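
    # The FAQ branch above expects a CSV shaped like (illustrative values):
    #
    #   Questions,Answers
    #   What is RAG?,Retrieval-augmented generation combines retrieval with an LLM.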

    def load_json_dataset(self, file_path: str, text_field: str = "text",
                          metadata_fields: Optional[List[str]] = None) -> List[Document]:
        """
        Load JSON data and convert it to documents.

        Handles both a top-level JSON array and JSON Lines (one object per
        line), which is how many Kaggle datasets ship.

        Args:
            file_path: Path to the JSON file
            text_field: Field name containing the main text
            metadata_fields: Fields to include as metadata

        Returns:
            List of Document objects
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            # Peek at the first non-whitespace character to detect the format.
            head = f.read(1)
            while head.isspace():
                head = f.read(1)
            f.seek(0)
            if head == '[':
                data = json.load(f)
            else:
                # JSON Lines: parse each non-empty line as its own object.
                data = [json.loads(line) for line in f if line.strip()]

        documents = []
        for item in data:
            text_content = item.get(text_field, "")

            # Carry over the requested fields as metadata.
            metadata = {"source": file_path}
            if metadata_fields:
                for field in metadata_fields:
                    if field in item:
                        metadata[field] = item[field]

            documents.append(Document(
                page_content=text_content,
                metadata=metadata
            ))
        return documents

    def load_text_dataset(self, file_path: str, chunk_size: int = 1000) -> List[Document]:
        """
        Load plain text data and convert it to documents.

        Args:
            file_path: Path to the text file
            chunk_size: Number of characters per document

        Returns:
            List of Document objects
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        documents = []
        # Slice the text into fixed-size character chunks.
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            documents.append(Document(
                page_content=chunk,
                metadata={
                    "source": file_path,
                    "chunk_id": i // chunk_size,
                    "start_char": i,
                    "end_char": min(i + chunk_size, len(text))
                }
            ))
        return documents
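
    # For example, with chunk_size=1000 a 2,500-character file yields three
    # documents with (start_char, end_char) spans of (0, 1000), (1000, 2000),
    # and (2000, 2500).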

# Example usage functions

def load_kaggle_csv_example():
    """Example: Load a CSV dataset from Kaggle."""
    # Initialize the loader (replace with your credentials, or rely on kaggle.json).
    loader = KaggleDataLoader("your_username", "your_api_key")

    # Download an example dataset (COVID-19 world vaccination progress).
    dataset_path = loader.download_dataset("gpreda/covid-world-vaccination-progress")

    # Load the CSV data.
    csv_file = os.path.join(dataset_path, "country_vaccinations.csv")
    documents = loader.load_csv_dataset(
        csv_file,
        text_columns=["country", "vaccines", "source_name"]
    )
    return documents

def load_kaggle_json_example():
    """Example: Load a JSON dataset from Kaggle."""
    loader = KaggleDataLoader("your_username", "your_api_key")

    # Download an example dataset (news articles, shipped as JSON Lines).
    dataset_path = loader.download_dataset("rmisra/news-category-dataset")

    # Load the JSON data.
    json_file = os.path.join(dataset_path, "News_Category_Dataset_v3.json")
    documents = loader.load_json_dataset(
        json_file,
        text_field="headline",
        metadata_fields=["category", "date"]
    )
    return documents
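
# Minimal smoke test; assumes valid Kaggle credentials (e.g. via kaggle.json)
# and network access, since it downloads a real dataset.
if __name__ == "__main__":
    docs = load_kaggle_csv_example()
    print(f"Loaded {len(docs)} documents")
    if docs:
        print(docs[0].page_content[:200])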