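"""Kaggle dataset loaders that turn CSV, JSON, and plain-text files into
LangChain Document objects for RAG pipelines."""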
import os
import pandas as pd
import json
from typing import List, Optional
from langchain_core.documents import Document

# NOTE: `import kaggle` is deferred to download_dataset() because the
# kaggle package authenticates as a side effect of being imported and
# raises if no credentials are configured yet.

class KaggleDataLoader:
    """Load and process Kaggle datasets for RAG."""
    
    def __init__(self, kaggle_username: Optional[str] = None, kaggle_key: Optional[str] = None):
        """
        Initialize Kaggle loader.
        
        Args:
            kaggle_username: Your Kaggle username (optional if using kaggle.json)
            kaggle_key: Your Kaggle API key (optional if using kaggle.json)
        """
        self.kaggle_username = kaggle_username
        self.kaggle_key = kaggle_key
        
        # Fall back to kaggle.json only when explicit credentials were
        # not provided, so constructor arguments take precedence
        if not (self.kaggle_username and self.kaggle_key):
            self._load_kaggle_credentials()
        
        # Set Kaggle credentials (either from kaggle.json or parameters)
        if self.kaggle_username and self.kaggle_key:
            os.environ['KAGGLE_USERNAME'] = self.kaggle_username
            os.environ['KAGGLE_KEY'] = self.kaggle_key
            print("Kaggle credentials loaded successfully")
        else:
            print("Warning: No Kaggle credentials found. Please set up kaggle.json or provide credentials.")
    
    def _load_kaggle_credentials(self):
        """
        Load Kaggle credentials from a kaggle.json file.

        Expected file contents (downloaded from your Kaggle account
        settings via "Create New API Token"):
            {"username": "<your-username>", "key": "<your-api-key>"}
        """
        # Common locations for kaggle.json
        possible_paths = [
            os.path.expanduser("~/.kaggle/kaggle.json"),
            os.path.expanduser("~/kaggle.json"),
            os.path.join(os.getcwd(), "kaggle.json"),
        ]
        
        for path in possible_paths:
            if os.path.exists(path):
                try:
                    with open(path, 'r') as f:
                        credentials = json.load(f)
                    
                    # Extract username and key from kaggle.json
                    if 'username' in credentials and 'key' in credentials:
                        self.kaggle_username = credentials['username']
                        self.kaggle_key = credentials['key']
                        print(f"Loaded Kaggle credentials from {path}")
                        return
                    else:
                        print(f"Invalid kaggle.json format at {path}. Expected 'username' and 'key' fields.")
                        
                except Exception as e:
                    print(f"Error reading kaggle.json from {path}: {e}")
        
        print("No valid kaggle.json found in common locations:")
        for path in possible_paths:
            print(f"  - {path}")
        print("Please create kaggle.json with your Kaggle API credentials.")
    
    def download_dataset(self, dataset_name: str, download_path: str = "./data") -> str:
        """
        Download a Kaggle dataset.
        
        Args:
            dataset_name: Dataset name in format 'username/dataset-name'
            download_path: Where to save the dataset
            
        Returns:
            Path to downloaded dataset
        """
        if not self.kaggle_username or not self.kaggle_key:
            raise ValueError("Kaggle credentials not found. Please set up kaggle.json or provide credentials.")
        
        try:
            # Create a unique directory for this dataset
            dataset_dir = dataset_name.replace('/', '_')
            full_download_path = os.path.join(download_path, dataset_dir)
            
            # Create the directory if it doesn't exist
            os.makedirs(full_download_path, exist_ok=True)
            
            # Imported here rather than at module level; see the note by
            # the top-level imports.
            import kaggle

            kaggle.api.authenticate()
            kaggle.api.dataset_download_files(dataset_name, path=full_download_path, unzip=True)
            print(f"Dataset {dataset_name} downloaded successfully to {full_download_path}")
            return full_download_path
        except Exception as e:
            print(f"Error downloading dataset: {e}")
            raise
    
    def load_csv_dataset(self, file_path: str, text_columns: List[str]) -> List[Document]:
        """
        Load CSV data and convert to documents.

        Rows with 'Questions' and 'Answers' columns are treated as FAQ
        pairs; otherwise the given text_columns are concatenated.

        Args:
            file_path: Path to CSV file
            text_columns: Columns to combine into the document text

        Returns:
            List of Document objects
        """
        try:
            df = pd.read_csv(file_path)
            documents = []
            
            # For FAQ datasets, try to combine question and answer columns
            if 'Questions' in df.columns and 'Answers' in df.columns:
                print(f"Processing FAQ dataset with {len(df)} Q&A pairs")
                for idx, row in df.iterrows():
                    # Skip rows with a missing question or answer
                    if pd.isna(row['Questions']) or pd.isna(row['Answers']):
                        continue
                    question = str(row['Questions']).strip()
                    answer = str(row['Answers']).strip()
                    
                    # Create a document with question prominently featured for better retrieval
                    content = f"QUESTION: {question}\n\nANSWER: {answer}"
                    documents.append(Document(
                        page_content=content,
                        metadata={"source": file_path, "type": "faq", "question_id": idx, "question": question}
                    ))
            else:
                # Fallback to original method for other CSV files
                print(f"Processing regular CSV with columns: {text_columns}")
                for idx, row in df.iterrows():
                    # Combine specified text columns
                    text_parts = []
                    for col in text_columns:
                        if col in df.columns and pd.notna(row[col]):
                            text_parts.append(str(row[col]).strip())
                    
                    if text_parts:
                        content = " ".join(text_parts)
                        documents.append(Document(
                            page_content=content,
                            metadata={"source": file_path, "row": idx}
                        ))
            
            print(f"Created {len(documents)} documents from CSV")
            return documents
            
        except Exception as e:
            print(f"Error loading CSV dataset: {e}")
            return []
    
    def load_json_dataset(self, file_path: str, text_field: str = "text",
                         metadata_fields: Optional[List[str]] = None) -> List[Document]:
        """
        Load JSON data and convert to documents.
        
        Args:
            file_path: Path to JSON file
            text_field: Field name containing the main text
            metadata_fields: Fields to include as metadata
            
        Returns:
            List of Document objects
        """
        # Support both a top-level JSON array and JSON Lines (one object
        # per line); many Kaggle datasets, including the news-category
        # example below, ship as JSON Lines.
        with open(file_path, 'r', encoding='utf-8') as f:
            first_char = f.read(1)
            f.seek(0)
            if first_char == '[':
                data = json.load(f)
            else:
                data = [json.loads(line) for line in f if line.strip()]
        
        documents = []
        
        for item in data:
            text_content = str(item.get(text_field, "")).strip()
            if not text_content:
                continue  # skip records without usable text
            
            # Create metadata
            metadata = {"source": file_path}
            if metadata_fields:
                for field in metadata_fields:
                    if field in item:
                        metadata[field] = item[field]
            
            documents.append(Document(
                page_content=text_content,
                metadata=metadata
            ))
        
        return documents
    
    def load_text_dataset(self, file_path: str, chunk_size: int = 1000) -> List[Document]:
        """
        Load plain text data and convert to documents.
        
        Args:
            file_path: Path to text file
            chunk_size: Number of characters per document
            
        Returns:
            List of Document objects
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        documents = []
        
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i+chunk_size]
            
            documents.append(Document(
                page_content=chunk,
                metadata={
                    "source": file_path,
                    "chunk_id": i // chunk_size,
                    "start_char": i,
                    "end_char": min(i + chunk_size, len(text))
                }
            ))
        
        return documents

# Example usage functions
def load_kaggle_csv_example():
    """Example: Load a CSV dataset from Kaggle."""
    # Initialize loader (credentials are read from kaggle.json, or pass
    # them explicitly: KaggleDataLoader("<username>", "<api-key>"))
    loader = KaggleDataLoader()
    
    # Download dataset (example: COVID-19 dataset)
    dataset_path = loader.download_dataset("gpreda/covid-world-vaccination-progress")
    
    # Load CSV data
    csv_file = os.path.join(dataset_path, "country_vaccinations.csv")
    documents = loader.load_csv_dataset(
        csv_file,
        text_columns=["country", "vaccines", "source_name"]
    )
    
    return documents
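
def load_kaggle_text_example():
    """
    Example: chunk a plain-text file from a downloaded dataset.

    This is a sketch: the dataset slug and file name below are
    placeholders, not a real dataset; substitute a Kaggle dataset that
    actually ships a .txt file.
    """
    loader = KaggleDataLoader()

    dataset_path = loader.download_dataset("your_username/your-text-dataset")  # placeholder slug
    text_file = os.path.join(dataset_path, "corpus.txt")  # placeholder file name
    return loader.load_text_dataset(text_file, chunk_size=1000)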

def load_kaggle_json_example():
    """Example: Load a JSON dataset from Kaggle."""
    loader = KaggleDataLoader()  # credentials read from kaggle.json
    
    # Download dataset (example: news articles)
    dataset_path = loader.download_dataset("rmisra/news-category-dataset")
    
    # Load JSON data
    json_file = os.path.join(dataset_path, "News_Category_Dataset_v3.json")
    documents = loader.load_json_dataset(
        json_file,
        text_field="headline",
        metadata_fields=["category", "date"]
    )
    
    return documents
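
# A minimal downstream sketch: once documents are loaded, they are
# typically split into overlapping chunks before embedding. Assumes the
# langchain-text-splitters package is installed; the chunk sizes are
# illustrative, not tuned.
def split_documents_example(documents: List[Document]) -> List[Document]:
    """Split loaded documents into overlapping chunks for embedding."""
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(documents)

if __name__ == "__main__":
    # Guarded because this hits the Kaggle API: it needs a configured
    # kaggle.json and network access.
    docs = load_kaggle_csv_example()
    chunks = split_documents_example(docs)
    print(f"{len(docs)} documents -> {len(chunks)} chunks")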