#!/usr/bin/env node

// Test enhanced URL validation specifically for ArXiv and other problematic URLs

/** Timeout (ms) for each individual URL probe. */
const URL_TIMEOUT_MS = 5000;

/** Timeout (ms) for the search request, so an unresponsive server cannot hang the script. */
const SEARCH_TIMEOUT_MS = 30000;

/** Local search endpoint queried in the second half of the script. */
const SEARCH_ENDPOINT = 'http://localhost:5000/api/search';

/** Fragment identifying the known-bad ArXiv URL that search results must not contain. */
const PROBLEMATIC_URL_MARKER = '2024.rag.advances';

/**
 * Accepted ArXiv identifier shapes. Both allow an optional version suffix
 * ("v2"), which ArXiv appends to address a specific revision of a paper.
 */
const ARXIV_ID_FORMATS = [
  /^\d{4}\.\d{4,5}(v\d+)?$/, // New format: 2024.12345
  /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/, // Old format: cs.AI/1234567
];

/**
 * Lower-case phrases whose presence in a response body marks the page as an
 * error page.
 * NOTE(review): 'error' and 'not found' are very broad and may also occur on
 * legitimate pages — confirm this list mirrors the validator under test.
 */
const ERROR_INDICATORS = [
  'not recognized',
  'might instead try to search',
  'article identifier',
  'not found',
  'error',
];

/**
 * Tells whether a hostname is arxiv.org or one of its subdomains.
 * An exact/suffix comparison is used so that look-alike hosts such as
 * "notarxiv.org" or "arxiv.org.example.com" are not treated as ArXiv.
 *
 * @param {string} hostname - Hostname as produced by the URL parser.
 * @returns {boolean} True for arxiv.org and *.arxiv.org.
 */
function isArxivHost(hostname) {
  return hostname === 'arxiv.org' || hostname.endsWith('.arxiv.org');
}

/**
 * Extracts the paper identifier from an ArXiv abstract URL.
 * Only the path is inspected, so query strings and fragments never leak into
 * the identifier.
 *
 * @param {URL} urlObj - Parsed URL.
 * @returns {string|null} Everything after "/abs/", or null when the path is not an abstract path.
 */
function extractArxivId(urlObj) {
  const match = urlObj.pathname.match(/^\/abs\/(.+)$/);
  return match ? match[1] : null;
}

/**
 * Checks a paper identifier against the accepted ArXiv formats.
 *
 * @param {string} paperId - Identifier taken from the URL.
 * @returns {boolean} True when at least one format matches.
 */
function hasValidArxivIdFormat(paperId) {
  return ARXIV_ID_FORMATS.some((format) => format.test(paperId));
}

/**
 * Tells whether a response body contains any of the error phrases.
 *
 * @param {string} content - Response body text.
 * @returns {boolean} True when any indicator occurs, ignoring case.
 */
function containsErrorIndicator(content) {
  // Lower-case once instead of once per indicator.
  const haystack = content.toLowerCase();
  return ERROR_INDICATORS.some((indicator) => haystack.includes(indicator));
}

/**
 * Validates one URL and logs the verdict: an ArXiv identifier format check
 * first (no request is made when it fails), then a GET request whose status
 * and body are inspected.
 *
 * @param {string} url - URL to validate.
 * @returns {Promise<void>}
 * @throws {TypeError} When the URL cannot be parsed.
 * @throws {Error} When the request fails or times out.
 */
async function checkUrl(url) {
  console.log(`Testing: ${url}`);

  // Simulate the validation logic
  const urlObj = new URL(url);

  if (isArxivHost(urlObj.hostname)) {
    // Test ArXiv validation
    const paperId = extractArxivId(urlObj);
    if (paperId !== null) {
      console.log(` ArXiv ID: ${paperId}`);

      const hasValidFormat = hasValidArxivIdFormat(paperId);
      console.log(` Format valid: ${hasValidFormat}`);

      if (!hasValidFormat) {
        console.log(' Result: ❌ INVALID (bad format)');
        return;
      }
    }
  }

  // Test actual URL
  const response = await fetch(url, {
    method: 'GET',
    signal: AbortSignal.timeout(URL_TIMEOUT_MS),
    headers: {
      'User-Agent': 'Knowledge-Base-Browser/1.0 (Enhanced Validator)',
    },
  });

  console.log(` Status: ${response.status}`);

  if (!response.ok) {
    console.log(` Result: ❌ INVALID (${response.status})`);
    return;
  }

  // Check content for errors
  const content = await response.text();
  if (containsErrorIndicator(content)) {
    console.log(' Content: Contains error messages');
    console.log(' Result: ❌ INVALID (error content)');
  } else {
    console.log(' Content: Valid');
    console.log(' Result: ✅ VALID');
  }
}

/**
 * Queries the search endpoint for "rag" and lists the results, flagging any
 * result that still points at the known-bad ArXiv URL.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the request fails, times out, or the body is not JSON.
 */
async function checkSearchFiltering() {
  const response = await fetch(SEARCH_ENDPOINT, {
    method: 'POST',
    signal: AbortSignal.timeout(SEARCH_TIMEOUT_MS),
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      query: 'rag',
      searchType: 'semantic',
      limit: 10,
    }),
  });

  if (!response.ok) {
    console.log('❌ Search request failed');
    return;
  }

  const data = await response.json();
  const results = data?.results;
  if (!Array.isArray(results)) {
    // Report the unexpected shape explicitly instead of failing on `.length`.
    console.log('❌ Search response did not contain a results array');
    return;
  }

  console.log(`Search for "rag" returned ${results.length} results:`);

  results.forEach((result, index) => {
    console.log(`${index + 1}. ${result?.title}`);
    console.log(` URL: ${result?.url}`);

    const resultUrl = result?.url;
    if (typeof resultUrl !== 'string') {
      // A missing URL must not abort the listing of the remaining results.
      console.log(' ⚠️ Result has no URL');
    } else if (resultUrl.includes(PROBLEMATIC_URL_MARKER)) {
      // Check if this is the problematic ArXiv URL
      console.log(' ⚠️ This should have been filtered out!');
    } else {
      console.log(' ✅ Valid URL');
    }
    console.log('');
  });
}

/**
 * Entry point: validates a fixed list of URLs one at a time, then checks that
 * the search endpoint no longer returns the known-bad URL. Failures of
 * individual checks are logged and do not stop the run.
 *
 * @returns {Promise<void>}
 */
async function testEnhancedValidation() {
  console.log('🔍 Testing Enhanced URL Validation...\n');

  const testUrls = [
    // Valid ArXiv URLs
    'https://arxiv.org/abs/2001.08361', // Real paper
    'https://arxiv.org/abs/1706.03762', // Attention is All You Need

    // Invalid ArXiv URLs (the problematic ones)
    'https://arxiv.org/abs/2024.rag.advances', // Invalid format
    'https://arxiv.org/abs/2024.fake.paper', // Invalid format
    'https://arxiv.org/abs/9999.99999', // Non-existent paper

    // Other problematic URLs
    'https://vldb.org/vector-db-2024', // 404 page
    'https://cvpr.org', // Unreachable
  ];

  console.log('🧪 Testing individual URLs with enhanced validation...\n');

  // Sequential on purpose: the log output of each URL stays grouped together.
  for (const url of testUrls) {
    try {
      await checkUrl(url);
    } catch (error) {
      console.log(` Error: ${error.message}`);
      console.log(' Result: ❌ INVALID (network error)');
    }
    console.log('');
  }

  console.log('🔎 Testing search with enhanced validation...\n');

  // Test the search endpoint to see if problematic URLs are filtered
  try {
    await checkSearchFiltering();
  } catch (error) {
    console.log('❌ Search test failed:', error.message);
  }

  console.log('🎯 Enhanced Validation Test Complete!');
}

// Never leave the top-level promise floating: report and signal failure.
testEnhancedValidation().catch((error) => {
  console.error('❌ Enhanced validation test crashed:', error);
  process.exitCode = 1;
});