brendon-ai committed
Commit b2f9df1 · verified · 1 Parent(s): e7ee502

Delete kaggle_loader.py

Files changed (1):
  kaggle_loader.py  +0 -240
kaggle_loader.py DELETED
@@ -1,240 +0,0 @@
- import os
- import pandas as pd
- import json
- from typing import List, Optional
- from langchain_core.documents import Document
- from langchain_community.document_loaders import CSVLoader, JSONLoader
- import kaggle
-
- class KaggleDataLoader:
-     """Load and process Kaggle datasets for RAG."""
-
-     def __init__(self, kaggle_username: Optional[str] = None, kaggle_key: Optional[str] = None):
-         """
-         Initialize Kaggle loader.
-
-         Args:
-             kaggle_username: Your Kaggle username (optional if using kaggle.json)
-             kaggle_key: Your Kaggle API key (optional if using kaggle.json)
-         """
-         self.kaggle_username = kaggle_username
-         self.kaggle_key = kaggle_key
-
-         # Try to load credentials from kaggle.json first
-         self._load_kaggle_credentials()
-
-         # Set Kaggle credentials (either from kaggle.json or parameters)
-         if self.kaggle_username and self.kaggle_key:
-             os.environ['KAGGLE_USERNAME'] = self.kaggle_username
-             os.environ['KAGGLE_KEY'] = self.kaggle_key
-             print("Kaggle credentials loaded successfully")
-         else:
-             print("Warning: No Kaggle credentials found. Please set up kaggle.json or provide credentials.")
-
-     def _load_kaggle_credentials(self):
-         """Load Kaggle credentials from kaggle.json file."""
-         # Common locations for kaggle.json
-         possible_paths = [
-             os.path.expanduser("~/.kaggle/kaggle.json"),
-             os.path.expanduser("~/kaggle.json"),
-             "./kaggle.json",
-             os.path.join(os.getcwd(), "kaggle.json")
-         ]
-
-         for path in possible_paths:
-             if os.path.exists(path):
-                 try:
-                     with open(path, 'r') as f:
-                         credentials = json.load(f)
-
-                     # Extract username and key from kaggle.json
-                     if 'username' in credentials and 'key' in credentials:
-                         self.kaggle_username = credentials['username']
-                         self.kaggle_key = credentials['key']
-                         print(f"Loaded Kaggle credentials from {path}")
-                         return
-                     else:
-                         print(f"Invalid kaggle.json format at {path}. Expected 'username' and 'key' fields.")
-
-                 except Exception as e:
-                     print(f"Error reading kaggle.json from {path}: {e}")
-
-         print("No valid kaggle.json found in common locations:")
-         for path in possible_paths:
-             print(f" - {path}")
-         print("Please create kaggle.json with your Kaggle API credentials.")
-
-     def download_dataset(self, dataset_name: str, download_path: str = "./data") -> str:
-         """
-         Download a Kaggle dataset.
-
-         Args:
-             dataset_name: Dataset name in format 'username/dataset-name'
-             download_path: Where to save the dataset
-
-         Returns:
-             Path to downloaded dataset
-         """
-         if not self.kaggle_username or not self.kaggle_key:
-             raise ValueError("Kaggle credentials not found. Please set up kaggle.json or provide credentials.")
-
-         try:
-             # Create a unique directory for this dataset
-             dataset_dir = dataset_name.replace('/', '_')
-             full_download_path = os.path.join(download_path, dataset_dir)
-
-             # Create the directory if it doesn't exist
-             os.makedirs(full_download_path, exist_ok=True)
-
-             kaggle.api.authenticate()
-             kaggle.api.dataset_download_files(dataset_name, path=full_download_path, unzip=True)
-             print(f"Dataset {dataset_name} downloaded successfully to {full_download_path}")
-             return full_download_path
-         except Exception as e:
-             print(f"Error downloading dataset: {e}")
-             raise
-
-     def load_csv_dataset(self, file_path: str, text_columns: List[str], chunk_size: int = 100) -> List[Document]:
-         """Load documents from a CSV file."""
-         try:
-             df = pd.read_csv(file_path)
-             documents = []
-
-             # For FAQ datasets, try to combine question and answer columns
-             if 'Questions' in df.columns and 'Answers' in df.columns:
-                 print(f"Processing FAQ dataset with {len(df)} Q&A pairs")
-                 for idx, row in df.iterrows():
-                     question = str(row['Questions']).strip()
-                     answer = str(row['Answers']).strip()
-
-                     # Create a document with question prominently featured for better retrieval
-                     content = f"QUESTION: {question}\n\nANSWER: {answer}"
-                     documents.append(Document(
-                         page_content=content,
-                         metadata={"source": file_path, "type": "faq", "question_id": idx, "question": question}
-                     ))
-             else:
-                 # Fallback to original method for other CSV files
-                 print(f"Processing regular CSV with columns: {text_columns}")
-                 for idx, row in df.iterrows():
-                     # Combine specified text columns
-                     text_parts = []
-                     for col in text_columns:
-                         if col in df.columns and pd.notna(row[col]):
-                             text_parts.append(str(row[col]).strip())
-
-                     if text_parts:
-                         content = " ".join(text_parts)
-                         documents.append(Document(
-                             page_content=content,
-                             metadata={"source": file_path, "row": idx}
-                         ))
-
-             print(f"Created {len(documents)} documents from CSV")
-             return documents
-
-         except Exception as e:
-             print(f"Error loading CSV dataset: {e}")
-             return []
-
-     def load_json_dataset(self, file_path: str, text_field: str = "text",
-                           metadata_fields: Optional[List[str]] = None) -> List[Document]:
-         """
-         Load JSON data and convert to documents.
-
-         Args:
-             file_path: Path to JSON file
-             text_field: Field name containing the main text
-             metadata_fields: Fields to include as metadata
-
-         Returns:
-             List of Document objects
-         """
-         with open(file_path, 'r') as f:
-             data = json.load(f)
-
-         documents = []
-
-         for item in data:
-             text_content = item.get(text_field, "")
-
-             # Create metadata
-             metadata = {"source": file_path}
-             if metadata_fields:
-                 for field in metadata_fields:
-                     if field in item:
-                         metadata[field] = item[field]
-
-             documents.append(Document(
-                 page_content=text_content,
-                 metadata=metadata
-             ))
-
-         return documents
-
-     def load_text_dataset(self, file_path: str, chunk_size: int = 1000) -> List[Document]:
-         """
-         Load plain text data and convert to documents.
-
-         Args:
-             file_path: Path to text file
-             chunk_size: Number of characters per document
-
-         Returns:
-             List of Document objects
-         """
-         with open(file_path, 'r', encoding='utf-8') as f:
-             text = f.read()
-
-         documents = []
-
-         for i in range(0, len(text), chunk_size):
-             chunk = text[i:i+chunk_size]
-
-             documents.append(Document(
-                 page_content=chunk,
-                 metadata={
-                     "source": file_path,
-                     "chunk_id": i // chunk_size,
-                     "start_char": i,
-                     "end_char": min(i + chunk_size, len(text))
-                 }
-             ))
-
-         return documents
-
- # Example usage functions
- def load_kaggle_csv_example():
-     """Example: Load a CSV dataset from Kaggle."""
-     # Initialize loader (replace with your credentials)
-     loader = KaggleDataLoader("your_username", "your_api_key")
-
-     # Download dataset (example: COVID-19 dataset)
-     dataset_path = loader.download_dataset("gpreda/covid-world-vaccination-progress")
-
-     # Load CSV data
-     csv_file = os.path.join(dataset_path, "country_vaccinations.csv")
-     documents = loader.load_csv_dataset(
-         csv_file,
-         text_columns=["country", "vaccines", "source_name"],
-         chunk_size=100
-     )
-
-     return documents
-
- def load_kaggle_json_example():
-     """Example: Load a JSON dataset from Kaggle."""
-     loader = KaggleDataLoader("your_username", "your_api_key")
-
-     # Download dataset (example: news articles)
-     dataset_path = loader.download_dataset("rmisra/news-category-dataset")
-
-     # Load JSON data
-     json_file = os.path.join(dataset_path, "News_Category_Dataset_v3.json")
-     documents = loader.load_json_dataset(
-         json_file,
-         text_field="headline",
-         metadata_fields=["category", "date"]
-     )
-
-     return documents
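
The removed loader returned plain LangChain Document objects and left chunking and indexing to the caller. As a rough sketch of that downstream hand-off, assuming the module were still present and using RecursiveCharacterTextSplitter from langchain_text_splitters (the splitter settings and file names below are illustrative, not taken from this repository):

# Hypothetical downstream use of the deleted module; splitter parameters are
# assumptions for illustration, not part of this commit.
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from kaggle_loader import KaggleDataLoader

loader = KaggleDataLoader()  # falls back to ~/.kaggle/kaggle.json if present
dataset_path = loader.download_dataset("gpreda/covid-world-vaccination-progress")
documents = loader.load_csv_dataset(
    os.path.join(dataset_path, "country_vaccinations.csv"),
    text_columns=["country", "vaccines", "source_name"],
)

# Split the loader's documents into smaller chunks before embedding/indexing.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(documents)
print(f"{len(documents)} documents -> {len(chunks)} chunks ready for indexing")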