KnowledgeBridge/tests/integration/test-enhanced-validation.js
fazeel007's picture
initial commit
7c012de
#!/usr/bin/env node
// Test enhanced URL validation specifically for ArXiv and other problematic URLs
// URLs exercised when the caller does not supply its own list.
const DEFAULT_TEST_URLS = [
  // Valid ArXiv URLs
  'https://arxiv.org/abs/2001.08361', // Real paper
  'https://arxiv.org/abs/1706.03762', // Attention is All You Need
  // Invalid ArXiv URLs (the problematic ones)
  'https://arxiv.org/abs/2024.rag.advances', // Invalid format
  'https://arxiv.org/abs/2024.fake.paper', // Invalid format
  'https://arxiv.org/abs/9999.99999', // Non-existent paper
  // Other problematic URLs
  'https://vldb.org/vector-db-2024', // 404 page
  'https://cvpr.org', // Unreachable
];

// Search endpoint queried when the caller does not supply one.
const DEFAULT_SEARCH_ENDPOINT = 'http://localhost:5000/api/search';

// Per-request time limits, so that an unresponsive host cannot hang the script.
const URL_FETCH_TIMEOUT_MS = 5000;
const SEARCH_TIMEOUT_MS = 10000;

// Accepted arXiv identifier shapes; both allow an optional version suffix ("v2").
const ARXIV_ID_FORMATS = [
  /^\d{4}\.\d{4,5}(v\d+)?$/, // New format: 2001.08361 or 2001.08361v2
  /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/, // Old format: hep-th/9901001, math.GT/0309136
];

// Lower-case substrings that mark a 200 response as an error page.
// NOTE(review): broad entries such as 'error' may also occur in legitimate
// pages and cause false negatives — confirm against the real validator.
const ERROR_INDICATORS = [
  'not recognized',
  'might instead try to search',
  'article identifier',
  'not found',
  'error',
];

// True only for arxiv.org itself or one of its subdomains (not look-alikes).
function isArxivHost(hostname) {
  return hostname === 'arxiv.org' || hostname.endsWith('.arxiv.org');
}

// Returns the paper ID from an "/abs/<id>" path, or null when there is none.
// Reading the pathname keeps query strings and fragments out of the ID.
function extractArxivId(urlObj) {
  const prefix = '/abs/';
  if (!urlObj.pathname.startsWith(prefix)) {
    return null;
  }
  const paperId = urlObj.pathname.slice(prefix.length).replace(/\/+$/, '');
  return paperId === '' ? null : paperId;
}

// True when the page body contains any of the known error phrases.
function hasErrorContent(content) {
  const lowered = content.toLowerCase();
  return ERROR_INDICATORS.some((indicator) => lowered.includes(indicator));
}

// Validates one URL (format check for arXiv, then a live GET) and logs the verdict.
async function checkUrl(url) {
  console.log(`Testing: ${url}`);

  let urlObj;
  try {
    urlObj = new URL(url);
  } catch (error) {
    // A parse failure is not a network problem; report it as such.
    console.log(` Error: ${error.message}`);
    console.log(' Result: ❌ INVALID (malformed URL)');
    return;
  }

  if (isArxivHost(urlObj.hostname)) {
    const paperId = extractArxivId(urlObj);
    if (paperId !== null) {
      console.log(` ArXiv ID: ${paperId}`);
      const hasValidFormat = ARXIV_ID_FORMATS.some((regex) => regex.test(paperId));
      console.log(` Format valid: ${hasValidFormat}`);
      if (!hasValidFormat) {
        // No point hitting the network for an ID that cannot exist.
        console.log(' Result: ❌ INVALID (bad format)');
        return;
      }
    }
  }

  try {
    const response = await fetch(url, {
      method: 'GET',
      signal: AbortSignal.timeout(URL_FETCH_TIMEOUT_MS),
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (Enhanced Validator)',
      },
    });
    console.log(` Status: ${response.status}`);
    if (!response.ok) {
      console.log(` Result: ❌ INVALID (${response.status})`);
      return;
    }

    // A 200 response can still be a "soft 404"; inspect the body.
    const content = await response.text();
    if (hasErrorContent(content)) {
      console.log(' Content: Contains error messages');
      console.log(' Result: ❌ INVALID (error content)');
    } else {
      console.log(' Content: Valid');
      console.log(' Result: ✅ VALID');
    }
  } catch (error) {
    console.log(` Error: ${error.message}`);
    console.log(' Result: ❌ INVALID (network error)');
  }
}

// Queries the search endpoint for "rag" and reports whether the known-bad
// ArXiv URL leaked into the results.
async function checkSearchResults(searchEndpoint) {
  try {
    const response = await fetch(searchEndpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        query: 'rag',
        searchType: 'semantic',
        limit: 10,
      }),
      // Without a limit, a stalled server would hang the script forever.
      signal: AbortSignal.timeout(SEARCH_TIMEOUT_MS),
    });
    if (!response.ok) {
      console.log('❌ Search request failed');
      return;
    }

    const data = await response.json();
    if (!Array.isArray(data?.results)) {
      console.log('❌ Search test failed:', 'response has no "results" array');
      return;
    }

    console.log(`Search for "rag" returned ${data.results.length} results:`);
    for (const [index, result] of data.results.entries()) {
      console.log(`${index + 1}. ${result?.title}`);
      console.log(` URL: ${result?.url}`);
      if (typeof result?.url !== 'string') {
        // Keep going: one malformed entry should not hide the others.
        console.log(' ⚠️ Result has no URL');
      } else if (result.url.includes('2024.rag.advances')) {
        // Check if this is the problematic ArXiv URL
        console.log(' ⚠️ This should have been filtered out!');
      } else {
        console.log(' ✅ Valid URL');
      }
      console.log('');
    }
  } catch (error) {
    console.log('❌ Search test failed:', error.message);
  }
}

/**
 * Test enhanced URL validation for ArXiv and other problematic URLs, then
 * check that the search endpoint does not return the known-bad ArXiv URL.
 * All findings are written to the console; failures are logged, not thrown.
 *
 * @param {object} [options]
 * @param {string[]} [options.urls] - URLs to validate (defaults to the built-in list).
 * @param {string} [options.searchEndpoint] - Search API URL to POST the query to.
 * @returns {Promise<void>} Resolves once both phases have been reported.
 */
async function testEnhancedValidation({
  urls = DEFAULT_TEST_URLS,
  searchEndpoint = DEFAULT_SEARCH_ENDPOINT,
} = {}) {
  console.log('🔍 Testing Enhanced URL Validation...\n');
  console.log('🧪 Testing individual URLs with enhanced validation...\n');
  // Sequential on purpose: keeps each URL's log lines grouped together.
  for (const url of urls) {
    await checkUrl(url);
    console.log('');
  }

  console.log('🔎 Testing search with enhanced validation...\n');
  await checkSearchResults(searchEndpoint);

  console.log('🎯 Enhanced Validation Test Complete!');
}
// Run the script. Any unexpected rejection is reported and turned into a
// non-zero exit code rather than being left as a floating promise.
testEnhancedValidation().catch((error) => {
  console.error('❌ Enhanced validation test crashed:', error);
  process.exitCode = 1;
});