File size: 4,416 Bytes
7c012de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
#!/usr/bin/env node
// Test enhanced URL validation specifically for ArXiv and other problematic URLs
/** Timeout, in milliseconds, applied to every HTTP request this script issues. */
const REQUEST_TIMEOUT_MS = 5000;

/** Local search endpoint queried after the individual URL checks. */
const SEARCH_ENDPOINT = 'http://localhost:5000/api/search';

/** URLs checked when the caller does not supply a list of its own. */
const DEFAULT_TEST_URLS = [
  // Valid ArXiv URLs
  'https://arxiv.org/abs/2001.08361', // Real paper
  'https://arxiv.org/abs/1706.03762', // Attention is All You Need
  // Invalid ArXiv URLs (the problematic ones)
  'https://arxiv.org/abs/2024.rag.advances', // Invalid format
  'https://arxiv.org/abs/2024.fake.paper', // Invalid format
  'https://arxiv.org/abs/9999.99999', // Non-existent paper
  // Other problematic URLs
  'https://vldb.org/vector-db-2024', // 404 page
  'https://cvpr.org', // Unreachable
];

/** Accepted arXiv identifier shapes; both allow an optional version suffix such as "v2". */
const ARXIV_ID_FORMATS = [
  /^\d{4}\.\d{4,5}(v\d+)?$/, // New format: 2024.12345
  /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/, // Old format: cs.AI/1234567
];

/**
 * Lower-case substrings that mark a 2xx response body as an error page.
 * NOTE(review): 'error' and 'not found' are very broad substring matches and may
 * flag healthy pages that merely mention those words — confirm this is intended.
 */
const ERROR_INDICATORS = [
  'not recognized',
  'might instead try to search',
  'article identifier',
  'not found',
  'error',
];

/**
 * Extracts the paper identifier from an arXiv abstract URL.
 *
 * Only the path is inspected, so a query string or fragment never becomes part
 * of the identifier, and the host must be arxiv.org itself or one of its
 * subdomains rather than any host that merely contains that text.
 *
 * @param {URL} urlObj - Parsed URL to inspect.
 * @returns {string|null} The identifier following "/abs/", or null when the URL
 *   is not an arXiv abstract URL.
 */
function extractArxivId(urlObj) {
  const { hostname, pathname } = urlObj;
  const isArxivHost = hostname === 'arxiv.org' || hostname.endsWith('.arxiv.org');
  if (!isArxivHost) {
    return null;
  }
  const match = pathname.match(/^\/abs\/(.+)$/);
  return match ? match[1] : null;
}

/**
 * Reports whether an identifier matches one of the accepted arXiv formats.
 *
 * @param {string} paperId - Identifier taken from an arXiv abstract URL.
 * @returns {boolean} True when any pattern in ARXIV_ID_FORMATS matches.
 */
function isValidArxivId(paperId) {
  return ARXIV_ID_FORMATS.some((format) => format.test(paperId));
}

/**
 * Reports whether a response body contains any of the known error phrases.
 *
 * @param {string} content - Response body text.
 * @returns {boolean} True when at least one indicator occurs, ignoring case.
 */
function containsErrorIndicator(content) {
  // Lower-case once instead of once per indicator.
  const haystack = content.toLowerCase();
  return ERROR_INDICATORS.some((indicator) => haystack.includes(indicator));
}

/**
 * Validates one URL and logs each step: arXiv ID format, HTTP status, and a
 * scan of the body for error phrases. Failures are logged, never thrown.
 *
 * @param {string} url - URL to validate.
 * @returns {Promise<void>}
 */
async function checkUrl(url) {
  console.log(`Testing: ${url}`);

  let urlObj;
  try {
    urlObj = new URL(url);
  } catch (error) {
    // A parse failure is not a network problem; report it as what it is.
    console.log(` Error: ${error.message}`);
    console.log(' Result: ❌ INVALID (malformed URL)');
    return;
  }

  try {
    const paperId = extractArxivId(urlObj);
    if (paperId !== null) {
      console.log(` ArXiv ID: ${paperId}`);
      const hasValidFormat = isValidArxivId(paperId);
      console.log(` Format valid: ${hasValidFormat}`);
      if (!hasValidFormat) {
        // A badly formatted ID cannot resolve, so skip the request entirely.
        console.log(' Result: ❌ INVALID (bad format)');
        return;
      }
    }

    const response = await fetch(url, {
      method: 'GET',
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (Enhanced Validator)',
      },
    });
    console.log(` Status: ${response.status}`);
    if (!response.ok) {
      console.log(` Result: ❌ INVALID (${response.status})`);
      return;
    }

    // A 2xx status can still carry an error page, so inspect the body too.
    const content = await response.text();
    if (containsErrorIndicator(content)) {
      console.log(' Content: Contains error messages');
      console.log(' Result: ❌ INVALID (error content)');
    } else {
      console.log(' Content: Valid');
      console.log(' Result: ✅ VALID');
    }
  } catch (error) {
    console.log(` Error: ${error.message}`);
    console.log(' Result: ❌ INVALID (network error)');
  }
}

/**
 * Queries the search endpoint for "rag" and logs every result, flagging the
 * known-bad arXiv URL if it was not filtered out. Failures are logged, never
 * thrown.
 *
 * @returns {Promise<void>}
 */
async function checkSearchFiltering() {
  try {
    const response = await fetch(SEARCH_ENDPOINT, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        query: 'rag',
        searchType: 'semantic',
        limit: 10,
      }),
      // Same bound as the per-URL requests so an unresponsive server cannot hang the script.
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
    });
    if (!response.ok) {
      console.log('❌ Search request failed');
      return;
    }

    const data = await response.json();
    const results = data?.results;
    if (!Array.isArray(results)) {
      console.log('❌ Search response did not include a results array');
      return;
    }

    console.log(`Search for "rag" returned ${results.length} results:`);
    for (const [index, result] of results.entries()) {
      const { title, url } = result ?? {};
      console.log(`${index + 1}. ${title}`);
      console.log(` URL: ${url}`);
      if (typeof url !== 'string') {
        console.log(' ⚠️ Result has no URL');
      } else if (url.includes('2024.rag.advances')) {
        // Check if this is the problematic ArXiv URL
        console.log(' ⚠️ This should have been filtered out!');
      } else {
        console.log(' ✅ Valid URL');
      }
      console.log('');
    }
  } catch (error) {
    console.log('❌ Search test failed:', error.message);
  }
}

/**
 * Runs the enhanced URL validation smoke test: checks each URL individually,
 * then queries the search endpoint to see whether the problematic URL is
 * filtered out. All progress and outcomes are written with console.log;
 * request failures are caught and logged rather than thrown.
 *
 * @param {string[]} [testUrls] - URLs to check; defaults to DEFAULT_TEST_URLS.
 * @returns {Promise<void>} Resolves once every check has been logged.
 */
async function testEnhancedValidation(testUrls = DEFAULT_TEST_URLS) {
  console.log('🔍 Testing Enhanced URL Validation...\n');
  console.log('🧪 Testing individual URLs with enhanced validation...\n');
  // Checked one at a time so each URL's log lines stay grouped together.
  for (const url of testUrls) {
    await checkUrl(url);
    console.log('');
  }

  console.log('🔍 Testing search with enhanced validation...\n');
  // Test the search endpoint to see if problematic URLs are filtered
  await checkSearchFiltering();

  console.log('🎯 Enhanced Validation Test Complete!');
}
// Run the checks when the script is executed. The function returns a promise, so
// attach a handler: an unexpected rejection is reported and reflected in the
// exit code instead of being left as a floating promise.
testEnhancedValidation().catch((error) => {
  console.error('Enhanced validation test crashed:', error);
  process.exitCode = 1;
});