File size: 8,751 Bytes
7c012de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
"""
Test suite for Knowledge Base Browser Gradio Component
"""

import json
import shutil
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from kb_browser import KnowledgeBrowser
from kb_browser.retriever import KnowledgeRetriever


class TestKnowledgeRetriever:
    """Test cases for the KnowledgeRetriever class.

    Each test runs against a fresh retriever whose ``index_path`` is a
    private temporary directory. The directory is removed again after the
    test so repeated runs do not accumulate stray temp folders.
    """

    def setup_method(self):
        """Create a temporary index directory and a retriever that uses it."""
        self.temp_dir = tempfile.mkdtemp()
        self.retriever = KnowledgeRetriever(index_path=self.temp_dir)

    def teardown_method(self):
        """Delete the temporary index directory created in setup_method."""
        # mkdtemp() leaves deletion to the caller. Cleanup is best-effort:
        # ignore_errors keeps a failed removal from failing the test itself.
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_initialization(self):
        """Test retriever initialization"""
        assert self.retriever is not None
        assert len(self.retriever.documents) > 0
        assert self.retriever.index_path == Path(self.temp_dir)

    def test_text_search_functionality(self):
        """Test text-based (keyword) search and the shape of its result"""
        results = self.retriever.search(
            query="retrieval augmented generation",
            search_type="keyword",
            k=3
        )

        # Top-level keys of the result dictionary
        assert "documents" in results
        assert "search_time" in results
        assert "query" in results
        assert "total_count" in results

        # The query is echoed back and at most k documents are returned
        assert results["query"] == "retrieval augmented generation"
        assert results["total_count"] >= 0
        assert len(results["documents"]) <= 3

        # Per-document keys are only checked when something was found
        if results["documents"]:
            doc = results["documents"][0]
            assert "id" in doc
            assert "title" in doc
            assert "content" in doc
            assert "snippet" in doc
            assert "relevance_score" in doc

    def test_semantic_search_with_openai(self):
        """Test semantic search with OpenAI embeddings"""
        # This will use the actual OpenAI API if available
        results = self.retriever.search(
            query="vector databases",
            search_type="semantic",
            k=2
        )

        assert results["total_count"] >= 0
        assert len(results["documents"]) <= 2

    def test_snippet_extraction(self):
        """Test snippet extraction functionality"""
        content = "This is a long document about retrieval augmented generation and vector databases."
        query = "retrieval"

        snippet = self.retriever._extract_snippet(content, query, max_length=50)
        assert "retrieval" in snippet.lower()
        assert len(snippet) <= 60  # Accounting for ellipsis

    def test_text_scoring(self):
        """Test text relevance scoring"""
        doc = {
            "title": "Retrieval Augmented Generation",
            "content": "This document discusses RAG and retrieval methods."
        }

        score = self.retriever._calculate_text_score(doc, "retrieval")
        assert 0 <= score <= 1
        assert score > 0  # Should match the word "retrieval"


class TestKnowledgeBrowser:
    """Test cases for the KnowledgeBrowser Gradio component.

    Each test gets a fresh component whose ``index_path`` is a private
    temporary directory. The directory is removed again after the test.
    """

    def setup_method(self):
        """Create a temporary index directory and a component that uses it."""
        self.temp_dir = tempfile.mkdtemp()
        self.kb_browser = KnowledgeBrowser(index_path=self.temp_dir)

    def teardown_method(self):
        """Delete the temporary index directory created in setup_method."""
        # mkdtemp() leaves deletion to the caller. Cleanup is best-effort:
        # ignore_errors keeps a failed removal from failing the test itself.
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_component_initialization(self):
        """Test the attribute values of a freshly constructed component"""
        assert self.kb_browser is not None
        assert self.kb_browser.query == ""
        assert self.kb_browser.results == []
        assert self.kb_browser.search_type == "semantic"
        assert self.kb_browser.max_results == 10

    def test_preprocess_method(self):
        """Test payload preprocessing"""
        payload = {
            "query": "test query",
            "search_type": "hybrid",
            "max_results": 5
        }

        processed = self.kb_browser.preprocess(payload)

        # Supplied fields pass through unchanged
        assert processed["query"] == "test query"
        assert processed["search_type"] == "hybrid"
        assert processed["max_results"] == 5
        # "filters" was not in the payload but must be present in the output
        assert "filters" in processed

    def test_postprocess_method(self):
        """Test value postprocessing"""
        value = {
            "query": "test query",
            "results": [{"title": "Test Doc", "snippet": "Test content"}],
            "search_type": "semantic",
            "total_count": 1,
            "search_time": 0.1
        }

        processed = self.kb_browser.postprocess(value)

        assert processed["query"] == "test query"
        assert len(processed["results"]) == 1
        assert processed["search_type"] == "semantic"
        assert processed["total_count"] == 1
        assert processed["search_time"] == 0.1

    def test_api_info(self):
        """Test API information structure"""
        api_info = self.kb_browser.api_info()

        assert "info" in api_info
        assert "type" in api_info["info"]
        assert "properties" in api_info["info"]

        properties = api_info["info"]["properties"]
        assert "query" in properties
        assert "results" in properties
        assert "search_type" in properties

    def test_example_inputs(self):
        """Test example inputs"""
        examples = self.kb_browser.example_inputs()

        assert "query" in examples
        assert "search_type" in examples
        assert "max_results" in examples

        assert examples["query"] == "retrieval augmented generation"
        assert examples["search_type"] == "semantic"
        assert examples["max_results"] == 5

    def test_search_method(self):
        """Test component search functionality"""
        results = self.kb_browser.search(
            query="vector search",
            search_type="semantic",
            max_results=3
        )

        # Top-level keys of the result dictionary
        assert "query" in results
        assert "results" in results
        assert "search_type" in results
        assert "total_count" in results
        assert "search_time" in results

        # Request parameters are echoed back and max_results is respected
        assert results["query"] == "vector search"
        assert results["search_type"] == "semantic"
        assert len(results["results"]) <= 3


class TestIntegration:
    """Integration tests that exercise the system with default construction"""

    def test_end_to_end_search(self):
        """Run a search through the component and check the result shape"""
        browser = KnowledgeBrowser()
        outcome = browser.search("LlamaIndex", search_type="semantic", max_results=2)

        # The result is a dict carrying a hit list under either key, plus timing
        assert isinstance(outcome, dict)
        assert any(key in outcome for key in ("documents", "results"))
        assert "search_time" in outcome

        # Nothing further to verify when the search produced no hits
        hits = outcome.get("documents") or outcome.get("results", [])
        if not hits:
            return

        top_hit = hits[0]
        for field in ("title", "snippet", "relevance_score"):
            assert field in top_hit

    @patch('kb_browser.retriever.LLAMA_INDEX_AVAILABLE', False)
    def test_fallback_when_llama_index_unavailable(self):
        """Search must still return a result with LLAMA_INDEX_AVAILABLE patched to False"""
        fallback_retriever = KnowledgeRetriever()
        outcome = fallback_retriever.search("test query", k=1)

        assert "documents" in outcome
        assert outcome["total_count"] >= 0


def test_sample_data_integrity():
    """Test that sample data is present and properly structured.

    Every document exposed by a default-constructed KnowledgeRetriever must
    carry the required keys, have non-blank title/content/source text, and
    use one of the known source types.
    """
    required_fields = ("id", "title", "content", "source", "source_type")
    non_blank_fields = ("title", "content", "source")
    allowed_source_types = ("pdf", "web", "academic", "code")

    retriever = KnowledgeRetriever()

    # Without this guard an empty document list would skip the loop below
    # and the test would pass without having checked anything.
    assert len(retriever.documents) > 0, "retriever loaded no documents"

    for doc in retriever.documents:
        for field in required_fields:
            assert field in doc, f"document is missing required field {field!r}"

        # Verify required fields are non-empty
        for field in non_blank_fields:
            assert doc[field].strip(), f"document field {field!r} is blank"

        assert doc["source_type"] in allowed_source_types, (
            f"unexpected source_type {doc['source_type']!r}"
        )


def run_manual_tests():
    """Exercise the retriever and the component by hand, printing progress"""
    print("Running manual tests...")

    # Step 1: query the retriever directly and report count and timing
    print("\n1. Testing KnowledgeRetriever...")
    manual_retriever = KnowledgeRetriever()
    retriever_output = manual_retriever.search("RAG", k=2)
    hit_total = retriever_output['total_count']
    elapsed = retriever_output['search_time']
    print(f"   Found {hit_total} results in {elapsed:.3f}s")

    # Step 2: run a search through the component wrapper
    print("\n2. Testing KnowledgeBrowser component...")
    component = KnowledgeBrowser()
    component_output = component.search("vector databases", max_results=1)
    returned = len(component_output.get('results', []))
    print(f"   Component search returned {returned} results")

    # Step 3: inspect the API description exposed by the component
    print("\n3. Testing API info...")
    described = component.api_info()
    property_count = len(described['info']['properties'])
    print(f"   API info has {property_count} properties")

    print("\nAll manual tests completed successfully!")


if __name__ == "__main__":
    # When this file is executed directly as a script, run the print-based
    # manual checks defined in run_manual_tests().
    run_manual_tests()