|
import { Express } from "express"; |
|
import { createServer, Server } from "http"; |
|
import { z } from "zod"; |
|
import fs from "fs"; |
|
import { storage } from "./storage"; |
|
import { searchRequestSchema } from "@shared/schema"; |
|
import { smartIngestionService } from "./smart-ingestion"; |
|
import { nebiusClient } from "./nebius-client"; |
|
import { modalClient } from "./modal-client"; |
|
import documentRoutes from "./document-routes"; |
|
import uploadFallbackRoutes from "./upload-fallback"; |
|
|
|
/**
 * The fields of a GitHub repository search result that this module reads.
 *
 * `description` and `language` are declared nullable because GitHub returns
 * `null` for repositories without a description or a detected language.
 */
interface GitHubRepo {
  id: number;
  /** Short repository name, without the owner. */
  name: string;
  /** `owner/name` form. */
  full_name: string;
  /** `null` when the repository has no description. */
  description: string | null;
  /** Browser URL of the repository. */
  html_url: string;
  stargazers_count: number;
  /** Primary language; `null` when none was detected. */
  language: string | null;
  topics: string[];
  /** ISO-8601 timestamp string as returned by the API. */
  created_at: string;
  /** ISO-8601 timestamp string as returned by the API. */
  updated_at: string;
}
|
|
|
|
|
|
|
|
|
/**
 * Removes `<think>…</think>` sections from model output.
 *
 * Input that is not a string, or that contains no `<think>` marker, is
 * returned untouched (and untrimmed). Otherwise every closed think section
 * is removed together with the whitespace that follows it; if an unclosed
 * `<think>` is still present, everything from that marker onwards is
 * dropped. The remaining text is trimmed.
 *
 * @param text - Raw model output.
 * @returns The text without think sections.
 */
function cleanThinkingTags(text: string): string {
  const openingTag = '<think>';

  if (typeof text !== 'string' || !text.includes(openingTag)) {
    return text;
  }

  const withoutClosedSections = text.replace(/<think>[\s\S]*?<\/think>\s*/g, '');

  // A marker that survives the replace was never closed.
  const danglingAt = withoutClosedSections.indexOf(openingTag);
  const visible =
    danglingAt === -1
      ? withoutClosedSections
      : withoutClosedSections.substring(0, danglingAt);

  return visible.trim();
}
|
|
|
|
|
/**
 * Checks whether a URL can currently be reached.
 *
 * Hosts containing `arxiv.org` are delegated to `validateArxivUrl`; hosts
 * containing `vldb.org`, `cvpr.org` or `icse.org` are delegated to
 * `validateContentUrl`. Every other host is probed with a single HEAD
 * request and counts as valid for any status in the 200-399 range.
 *
 * @param url - Absolute URL to probe.
 * @param timeout - Milliseconds after which the in-flight request is aborted.
 * @returns `true` when the URL looks reachable; `false` for a malformed URL,
 *   a network failure, a timeout, or a status outside 200-399.
 */
async function validateUrl(url: string, timeout: number = 5000): Promise<boolean> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    console.log(`Validating URL: ${url}`);

    const { hostname } = new URL(url);

    // `return await` keeps the delegated call inside this try block, so the
    // timer is only cleared once the delegate has settled.
    if (hostname.includes('arxiv.org')) {
      return await validateArxivUrl(url, controller.signal);
    }

    const contentCheckedHosts = ['vldb.org', 'cvpr.org', 'icse.org'];
    if (contentCheckedHosts.some(domain => hostname.includes(domain))) {
      return await validateContentUrl(url, controller.signal);
    }

    const response = await fetch(url, {
      method: 'HEAD',
      signal: controller.signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (URL Validator)'
      }
    });

    const isValid = response.status >= 200 && response.status < 400;
    console.log(`URL ${url} validation result: ${isValid ? 'VALID' : 'INVALID'} (${response.status})`);
    return isValid;
  } catch (error) {
    console.log(`URL ${url} validation failed: ${error instanceof Error ? error.message : String(error)}`);
    return false;
  } finally {
    // Runs on every exit path, so no timer is left pending after the probe.
    clearTimeout(timeoutId);
  }
}
|
|
|
|
|
/**
 * Validates an arXiv abstract URL.
 *
 * The URL must contain `arxiv.org/abs/<id>`, where `<id>` is either a
 * new-style identifier (`2301.12345`) or an old-style one
 * (`hep-th/9901001`, `math.GT/0309136`), optionally followed by a version
 * suffix such as `v2`. A query string or fragment after the identifier is
 * ignored for the format check. When the format is accepted, the page is
 * fetched and rejected if the response is not OK or its body contains one of
 * the arXiv "unknown article" phrases.
 *
 * @param url - URL to validate.
 * @param signal - Abort signal forwarded to `fetch`.
 * @returns `true` when the identifier is well-formed and the page loads
 *   without an "unknown article" message; `false` otherwise, including on
 *   any thrown error.
 */
async function validateArxivUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    // Stop the capture at `?` or `#` so tracking parameters do not end up in the id.
    const match = url.match(/arxiv\.org\/abs\/([^?#]+)/);
    if (!match) {
      console.log(`Invalid ArXiv URL format: ${url}`);
      return false;
    }

    const paperId = match[1].replace(/\/+$/, '');

    // The version suffix is optional; URLs built from the arXiv API `<id>`
    // element always carry one (e.g. `2301.12345v1`).
    const validFormats = [
      /^\d{4}\.\d{4,5}(v\d+)?$/,
      /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/,
    ];

    const hasValidFormat = validFormats.some(regex => regex.test(paperId));
    if (!hasValidFormat) {
      console.log(`Invalid ArXiv paper ID format: ${paperId}`);
      return false;
    }

    const response = await fetch(url, {
      method: 'GET',
      signal: signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (ArXiv Validator)'
      }
    });

    if (!response.ok) {
      console.log(`ArXiv URL returned ${response.status}: ${url}`);
      return false;
    }

    const pageText = (await response.text()).toLowerCase();

    // Only phrases specific to the "unknown article" page are checked. Bare
    // words such as "error" or "not found" also occur in ordinary titles and
    // abstracts and would reject valid papers.
    const errorIndicators = [
      'not recognized',
      'might instead try to search',
      'article identifier'
    ];

    const hasError = errorIndicators.some(indicator => pageText.includes(indicator));
    if (hasError) {
      console.log(`ArXiv paper not found: ${url}`);
      return false;
    }

    console.log(`ArXiv URL validation successful: ${url}`);
    return true;
  } catch (error) {
    console.log(`ArXiv URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}
|
|
|
|
|
/**
 * Validates a URL by downloading it and looking for "soft 404" wording.
 *
 * The URL is fetched with GET. It is rejected when the response is not OK or
 * when the lower-cased body contains one of the error phrases below.
 *
 * @param url - URL to validate.
 * @param signal - Abort signal forwarded to `fetch`.
 * @returns `true` when the page loads and shows no error phrase; `false`
 *   otherwise, including on any thrown error.
 */
async function validateContentUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    const response = await fetch(url, {
      method: 'GET',
      signal: signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (Content Validator)'
      }
    });

    if (!response.ok) {
      console.log(`Content URL returned ${response.status}: ${url}`);
      return false;
    }

    // Lower-case once instead of once per indicator.
    const pageText = (await response.text()).toLowerCase();

    // Phrases only. A bare "404" or "error" also matches page ranges, colour
    // codes and script text on healthy pages, so those are not checked; a real
    // 404 is already caught by the status check above.
    const errorIndicators = [
      'not found',
      'page not found',
      'does not exist',
      'can\'t be reached',
      'site is temporarily unavailable'
    ];

    const hasError = errorIndicators.some(indicator => pageText.includes(indicator));
    if (hasError) {
      console.log(`Content validation failed for: ${url}`);
      return false;
    }

    console.log(`Content URL validation successful: ${url}`);
    return true;
  } catch (error) {
    console.log(`Content URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}
|
|
|
|
|
/**
 * Validates many URLs with `validateUrl`, a batch at a time.
 *
 * Each batch is probed in parallel and the next batch starts only when the
 * previous one has finished. Duplicate URLs are probed once.
 *
 * @param urls - URLs to validate.
 * @param concurrencyLimit - Maximum number of probes per batch. Values below
 *   1 (or NaN) are treated as 1.
 * @returns A map from each distinct URL to its validation result.
 */
async function validateUrls(urls: string[], concurrencyLimit: number = 5): Promise<Map<string, boolean>> {
  const results = new Map<string, boolean>();

  // A step below 1 would never advance the loop.
  const batchSize = Math.max(1, Math.floor(concurrencyLimit) || 1);
  const distinctUrls = Array.from(new Set(urls));

  for (let start = 0; start < distinctUrls.length; start += batchSize) {
    const batch = distinctUrls.slice(start, start + batchSize);
    const verdicts = await Promise.all(batch.map(url => validateUrl(url)));
    batch.forEach((url, position) => {
      results.set(url, verdicts[position]);
    });
  }

  return results;
}
|
|
|
|
|
/**
 * Looks up the Wikipedia REST summary for the query.
 *
 * @returns A single-element array when the summary extract is longer than
 *   50 characters, otherwise an empty array. Never throws.
 */
async function searchWikipediaSummary(query: string): Promise<any[]> {
  try {
    const wikiSearchUrl = `https://en.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(query.replace(/\s+/g, '_'))}`;
    console.log('Searching Wikipedia:', wikiSearchUrl);

    const wikiResponse = await fetch(wikiSearchUrl, {
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0'
      },
      signal: AbortSignal.timeout(3000)
    });
    if (!wikiResponse.ok) {
      return [];
    }

    const wikiData = await wikiResponse.json();
    if (!wikiData.extract || wikiData.extract.length <= 50) {
      return [];
    }

    console.log('Found Wikipedia result:', wikiData.title);
    return [{
      title: wikiData.title,
      content: wikiData.extract,
      url: wikiData.content_urls?.desktop?.page || `https://en.wikipedia.org/wiki/${encodeURIComponent(query)}`,
      source: 'Wikipedia',
      type: 'encyclopedia'
    }];
  } catch (wikiError) {
    console.log('Wikipedia search failed:', wikiError instanceof Error ? wikiError.message : String(wikiError));
    return [];
  }
}

/**
 * Queries the arXiv API and extracts up to two papers from the Atom feed.
 *
 * @returns Papers whose summary is longer than 50 characters (summaries are
 *   cut to 300 characters). Never throws.
 */
async function searchArxivPapers(query: string): Promise<any[]> {
  try {
    const arxivQuery = encodeURIComponent(query);
    const arxivUrl = `https://export.arxiv.org/api/query?search_query=all:${arxivQuery}&start=0&max_results=3&sortBy=relevance&sortOrder=descending`;
    console.log('Searching ArXiv for research papers');

    const arxivResponse = await fetch(arxivUrl, {
      signal: AbortSignal.timeout(5000)
    });
    if (!arxivResponse.ok) {
      return [];
    }

    const arxivXml = await arxivResponse.text();
    const papers: any[] = [];

    // Index 0 is the feed header before the first <entry>; keep the next two.
    for (const entry of arxivXml.split('<entry>').slice(1, 3)) {
      const titleMatch = entry.match(/<title[^>]*>([^<]+)<\/title>/);
      const summaryMatch = entry.match(/<summary[^>]*>([^<]+)<\/summary>/);
      const linkMatch = entry.match(/<id[^>]*>([^<]+)<\/id>/);
      if (!titleMatch || !summaryMatch || !linkMatch) {
        continue;
      }

      // Collapse line wraps and runs of spaces inside the feed text.
      const title = titleMatch[1].replace(/\s+/g, ' ').trim();
      const summary = summaryMatch[1].replace(/\s+/g, ' ').trim().substring(0, 300);
      const url = linkMatch[1].trim();

      if (title && summary.length > 50) {
        papers.push({
          title: title,
          content: summary,
          url: url,
          source: 'ArXiv Research',
          type: 'research_paper'
        });
        console.log('Found ArXiv paper:', title);
      }
    }

    return papers;
  } catch (arxivError) {
    console.log('ArXiv search failed:', arxivError instanceof Error ? arxivError.message : String(arxivError));
    return [];
  }
}

/**
 * Looks up a country by name via restcountries.com after removing the words
 * "country" / "nation" from the query.
 *
 * @returns A single-element array describing the first match, or an empty
 *   array. Never throws.
 */
async function searchCountryFacts(query: string): Promise<any[]> {
  try {
    const countryQuery = query.replace(/country|nation/gi, '').trim();
    const countryUrl = `https://restcountries.com/v3.1/name/${encodeURIComponent(countryQuery)}`;

    const countryResponse = await fetch(countryUrl, {
      signal: AbortSignal.timeout(3000)
    });
    if (!countryResponse.ok) {
      return [];
    }

    const countryData = await countryResponse.json();
    if (!Array.isArray(countryData) || countryData.length === 0) {
      return [];
    }

    const country = countryData[0];
    // Skip whichever of region/subregion is absent instead of printing "undefined".
    const location = [country.region, country.subregion].filter(Boolean).join(', ');

    console.log('Found country information:', country.name.common);
    return [{
      title: `${country.name.common} - Country Information`,
      content: `${country.name.common} is located in ${location}. Capital: ${country.capital?.[0] || 'N/A'}. Population: ${country.population?.toLocaleString() || 'Unknown'}. Official languages: ${Object.values(country.languages || {}).join(', ')}.`,
      url: `https://en.wikipedia.org/wiki/${encodeURIComponent(country.name.common)}`,
      source: 'REST Countries API',
      type: 'geographic'
    }];
  } catch (countryError) {
    console.log('Country search failed:', countryError instanceof Error ? countryError.message : String(countryError));
    return [];
  }
}

/**
 * Gathers web results for a query from up to three public sources and keeps
 * only those whose URL passes `validateUrls`.
 *
 * Wikipedia is always queried. arXiv is queried when the query mentions one
 * of the machine-learning keywords below, and restcountries.com when it
 * mentions "country" or "nation". The lookups run concurrently; results are
 * ordered Wikipedia, arXiv, country.
 *
 * @param query - Free-text search query.
 * @param maxResults - Maximum number of results to return.
 * @returns Result objects with `title`, `content`, `url`, `source` and
 *   `type`; an empty array on unexpected failure.
 */
async function searchWeb(query: string, maxResults: number = 10): Promise<any[]> {
  try {
    console.log(`Starting web search for: "${query}"`);

    const lowerQuery = query.toLowerCase();
    const arxivKeywords = [
      'machine learning',
      'neural network',
      'algorithm',
      'artificial intelligence',
      'data science',
      'deep learning'
    ];
    const wantsArxiv = arxivKeywords.some(keyword => lowerQuery.includes(keyword));
    const wantsCountry = lowerQuery.includes('country') || lowerQuery.includes('nation');

    const noResults: any[] = [];
    // Every helper handles its own failures, so this Promise.all cannot reject.
    const perSource = await Promise.all([
      searchWikipediaSummary(query),
      wantsArxiv ? searchArxivPapers(query) : Promise.resolve(noResults),
      wantsCountry ? searchCountryFacts(query) : Promise.resolve(noResults)
    ]);
    const results = noResults.concat(...perSource);

    console.log(`Web search completed. Found ${results.length} results.`);

    if (results.length === 0) {
      return results;
    }

    console.log('Validating URLs for accessibility...');
    const validationResults = await validateUrls(results.map(result => result.url));

    const validResults = results.filter(result => {
      const isValid = Boolean(validationResults.get(result.url));
      if (!isValid) {
        console.log(`Filtered out invalid URL: ${result.url} (${result.title})`);
      }
      return isValid;
    });

    console.log(`URL validation completed. ${validResults.length}/${results.length} URLs are accessible.`);
    return validResults.slice(0, maxResults);
  } catch (error) {
    console.error('Web search error:', error);
    return [];
  }
}
|
|
|
|
|
/**
 * Converts a raw web search hit into the document shape used by the search
 * response.
 *
 * @param result - Hit carrying `title`, `content`, `url`, `source`, `type`.
 * @param rank - Zero-based position; lowers the relevance score by 0.1 per
 *   step from 0.6 down to a floor of 0.2.
 * @param query - Query that produced the hit.
 * @returns A document object. `id`, `metadata.fetched_at` and
 *   `retrievalTime` vary between calls (clock and `Math.random`).
 */
function transformWebResultToDocument(result: any, rank: number, query: string): any {
  const snippetLimit = 200;
  const fullText = result.content;

  let snippet = fullText;
  if (fullText.length > snippetLimit) {
    snippet = fullText.substring(0, snippetLimit) + '...';
  }

  const document = {
    id: `web_${Date.now()}_${rank}`,
    title: result.title,
    content: fullText,
    snippet,
    source: result.source,
    sourceType: 'web',
    url: result.url,
    metadata: {
      search_type: result.type,
      fetched_at: new Date().toISOString()
    },
    relevanceScore: Math.max(0.2, 0.6 - (rank * 0.1)),
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.2 + 0.1,
    // Rough estimate: four characters per token.
    tokenCount: Math.floor(fullText.length / 4)
  };

  return document;
}
|
|
|
/**
 * Runs one GitHub repository search, sorted by stars in descending order.
 *
 * The `Authorization` header is only sent when `GITHUB_TOKEN` is set.
 *
 * @returns The `items` array of the response, or `null` when the API
 *   answered with a non-OK status.
 */
async function requestGitHubRepoSearch(searchQuery: string, maxResults: number): Promise<any[] | null> {
  const headers: Record<string, string> = {
    'Accept': 'application/vnd.github.v3+json',
    'User-Agent': 'Knowledge-Base-Browser'
  };
  const token = process.env.GITHUB_TOKEN;
  if (token) {
    headers['Authorization'] = `token ${token}`;
  }

  const response = await fetch(
    `https://api.github.com/search/repositories?q=${encodeURIComponent(searchQuery)}&sort=stars&order=desc&per_page=${maxResults}`,
    { headers }
  );

  if (!response.ok) {
    console.error('GitHub API error:', response.status, response.statusText);
    return null;
  }

  const data = await response.json();
  return Array.isArray(data.items) ? data.items : [];
}

/**
 * Searches GitHub repositories for a free-text query.
 *
 * - A query containing the word "by" followed by a name ("… by some user")
 *   is searched as `<topic> user:<name without spaces>`. If that yields
 *   nothing, or the API rejects it, looser fallback queries are tried in
 *   turn and the first non-empty result is used, preferring repositories
 *   whose owner, full name or description mentions the author.
 * - A query mentioning "data structures" or "algorithm" gets those phrases
 *   and `language:python` appended.
 * - Any other query gets `language:python` appended.
 *
 * Whichever path produced the repositories, only those whose `html_url`
 * passes `validateUrls` are returned.
 *
 * @param query - Free-text query.
 * @param maxResults - Page size requested from GitHub.
 * @returns Raw repository objects; an empty array on failure.
 */
async function searchGitHubRepos(query: string, maxResults: number = 10): Promise<any[]> {
  try {
    const lowerQuery = query.toLowerCase();

    // `\b` keeps words that merely end in "by" (e.g. "ruby on rails") from
    // being read as an author clause.
    const byAuthorPattern = /\bby\s+([a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*)/i;
    const byAuthorMatch = query.match(byAuthorPattern);

    let authorName = '';
    let topicPart = '';
    let searchQuery: string;

    if (byAuthorMatch) {
      authorName = byAuthorMatch[1].trim();
      topicPart = query.replace(byAuthorPattern, '').trim();
      searchQuery = `${topicPart} user:${authorName.replace(/\s+/g, '')}`;
    } else if (lowerQuery.includes('data structures') || lowerQuery.includes('algorithm')) {
      searchQuery = `${query} "data structures" OR "algorithms" language:python`;
    } else {
      searchQuery = `${query} language:python`;
    }

    console.log('GitHub search query:', searchQuery);

    const primary = await requestGitHubRepoSearch(searchQuery, maxResults);
    if (primary === null && !byAuthorMatch) {
      return [];
    }

    let repos: any[] = primary ?? [];

    // For author queries a rejected primary request is treated like an empty
    // result: GitHub answers with an error status when the `user:` qualifier
    // names an account it cannot find, which is exactly the case the
    // fallbacks exist for.
    if (repos.length === 0 && byAuthorMatch) {
      const loweredAuthor = authorName.toLowerCase();
      const mentionsAuthor = (value: unknown): boolean =>
        typeof value === 'string' && value.toLowerCase().includes(loweredAuthor);

      const fallbackQueries = [
        `"${authorName}" ${topicPart}`,
        `${topicPart} "${authorName}"`,
        `${authorName} ${topicPart}`,
        topicPart
      ]
        .map(candidate => candidate.trim())
        // An empty topic would otherwise send an empty query.
        .filter(candidate => candidate.length > 0);

      for (const fallbackQuery of fallbackQueries) {
        console.log('Trying fallback query:', fallbackQuery);

        const candidates = await requestGitHubRepoSearch(fallbackQuery, maxResults);
        if (!candidates || candidates.length === 0) {
          continue;
        }

        const authorFilteredResults = candidates.filter((repo: any) =>
          mentionsAuthor(repo.owner?.login) ||
          mentionsAuthor(repo.full_name) ||
          mentionsAuthor(repo.description)
        );

        repos = authorFilteredResults.length > 0 ? authorFilteredResults : candidates;
        break;
      }
    }

    if (repos.length === 0) {
      return repos;
    }

    console.log('Validating GitHub repository URLs...');
    const urls = repos.map((repo: GitHubRepo) => repo.html_url);
    const validationResults = await validateUrls(urls);

    const validRepos = repos.filter((repo: GitHubRepo) => {
      const isValid = Boolean(validationResults.get(repo.html_url));
      if (!isValid) {
        console.log(`Filtered out invalid GitHub repo: ${repo.html_url} (${repo.full_name})`);
      }
      return isValid;
    });

    console.log(`GitHub URL validation completed. ${validRepos.length}/${repos.length} repositories are accessible.`);
    return validRepos;
  } catch (error) {
    console.error('Error fetching GitHub repos:', error);
    return [];
  }
}
|
|
|
/**
 * Converts a GitHub repository into the document shape used by the search
 * response.
 *
 * A missing description is rendered as "No description available", a
 * missing language as "Unknown" in the text body, and a missing topic list
 * as an empty list, so the result never contains the text "null" and the
 * call does not throw on sparse repositories. `metadata.language` keeps the
 * value received from the API.
 *
 * @param repo - Repository as returned by the GitHub search API.
 * @param rank - Zero-based position; lowers the relevance score by 0.1 per
 *   step from 0.7 down to a floor of 0.3.
 * @param query - Query that produced the repository.
 * @returns A document object. `retrievalTime` varies between calls
 *   (`Math.random`).
 */
function transformGitHubRepoToDocument(repo: GitHubRepo, rank: number, query: string): any {
  const description = repo.description || 'No description available';
  const snippet = description.length > 200
    ? description.substring(0, 200) + '...'
    : description;

  const language = repo.language ?? 'Unknown';
  const topics = repo.topics ?? [];

  return {
    id: repo.id,
    title: `${repo.name} - ${repo.full_name}`,
    content: `${description}\n\nRepository: ${repo.full_name}\nLanguage: ${language}\nStars: ${repo.stargazers_count}\nTopics: ${topics.join(', ')}\nCreated: ${repo.created_at}\nLast Updated: ${repo.updated_at}`,
    snippet,
    source: `GitHub Repository`,
    sourceType: 'code',
    url: repo.html_url,
    metadata: {
      stars: repo.stargazers_count,
      language: repo.language,
      topics,
      created_at: repo.created_at,
      updated_at: repo.updated_at
    },
    relevanceScore: Math.max(0.3, 0.7 - (rank * 0.1)),
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.3 + 0.1,
    // Rough estimate: four characters per token; 100 characters are assumed
    // when there is no description.
    tokenCount: Math.floor((repo.description?.length || 100) / 4)
  };
}
|
|
|
export async function registerRoutes(app: Express): Promise<Server> { |
|
|
|
// GET /api/knowledge-graph
//
// Builds a node/link graph from the documents returned by
// storage.getDocuments(50) (presumably the 50 is a row limit - confirm
// against the storage module):
//   - one "document" node per document;
//   - a document-to-document link when two documents share three or more
//     concepts from the wider vocabulary;
//   - one "concept" node per core concept mentioned by two or more
//     documents, linked to each of those documents;
//   - one "author" node per team derived from document metadata or source,
//     linked to its documents;
//   - one "topic" node per source type carried by two or more documents.
// On failure the handler answers 500 with empty nodes, links and stats.
app.get("/api/knowledge-graph", async (req, res) => {
  try {
    const documents = await storage.getDocuments(50);

    const nodes: any[] = [];
    const links: any[] = [];

    // Concepts that can become graph nodes of their own.
    const nodeConcepts = [
      'ai', 'artificial intelligence', 'machine learning', 'deep learning',
      'neural networks', 'transformer', 'attention', 'embedding', 'vector',
      'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini',
      'multimodal', 'fine-tuning', 'training', 'optimization', 'safety',
      'alignment', 'reasoning', 'language model', 'nlp', 'computer vision'
    ];

    // Wider vocabulary, used only to decide whether two documents are related.
    const linkConcepts = [
      ...nodeConcepts,
      'code generation', 'programming', 'software', 'development', 'copilot',
      'constitutional ai', 'rlhf', 'instruction tuning', 'benchmarks',
      'performance', 'efficiency', 'compression', 'quantization', 'edge ai',
      'mamba', 'mixture of experts', 'moe', 'architecture', 'scaling'
    ];

    // These short tokens occur inside ordinary words ("said", "storage"), so
    // they must match as whole words; longer concepts keep substring matching
    // so that plurals and compounds still count.
    const wholeWordPatterns = new Map<string, RegExp>(
      ['ai', 'rag', 'moe'].map((token): [string, RegExp] => [token, new RegExp(`\\b${token}s?\\b`)])
    );
    const mentionsConcept = (loweredText: string, concept: string): boolean => {
      const pattern = wholeWordPatterns.get(concept);
      return pattern ? pattern.test(loweredText) : loweredText.includes(concept);
    };

    // Detect concepts once per document instead of once per document pair.
    const profiles = documents.map(doc => {
      const loweredContent = doc.content.toLowerCase();
      return {
        doc,
        concepts: new Set(linkConcepts.filter(concept => mentionsConcept(loweredContent, concept)))
      };
    });

    documents.forEach(doc => {
      nodes.push({
        id: `doc_${doc.id}`,
        label: doc.title.substring(0, 50) + (doc.title.length > 50 ? "..." : ""),
        type: "document",
        size: 12,
        color: "#3b82f6",
        metadata: {
          title: doc.title,
          sourceType: doc.sourceType,
          year: new Date(doc.createdAt).getFullYear(),
          id: doc.id
        }
      });
    });

    // Core concept -> ids of the documents that mention it.
    const conceptToDocuments = new Map<string, number[]>();
    profiles.forEach(({ doc, concepts }) => {
      nodeConcepts.forEach(concept => {
        if (!concepts.has(concept)) {
          return;
        }
        const owners = conceptToDocuments.get(concept) ?? [];
        owners.push(doc.id);
        conceptToDocuments.set(concept, owners);
      });
    });

    // Each unordered pair is visited once, so no de-duplication is needed.
    profiles.forEach((first, position) => {
      profiles.slice(position + 1).forEach(second => {
        if (first.doc.id === second.doc.id) {
          return;
        }

        let sharedConcepts = 0;
        first.concepts.forEach(concept => {
          if (second.concepts.has(concept)) {
            sharedConcepts++;
          }
        });

        if (sharedConcepts >= 3) {
          links.push({
            source: `doc_${first.doc.id}`,
            target: `doc_${second.doc.id}`,
            relationship: "related_concepts",
            strength: Math.min(sharedConcepts / 10, 1),
            color: "#3b82f6"
          });
        }
      });
    });

    conceptToDocuments.forEach((docIds, concept) => {
      const count = docIds.length;
      if (count < 2) {
        return;
      }

      const conceptNodeId = `concept_${concept.replace(/\s+/g, '_')}`;
      nodes.push({
        id: conceptNodeId,
        label: concept,
        type: "concept",
        size: 8 + count * 2,
        color: "#10b981",
        metadata: {
          documentCount: count,
          concept: concept
        }
      });

      docIds.forEach(docId => {
        links.push({
          source: `doc_${docId}`,
          target: conceptNodeId,
          relationship: "contains_concept",
          strength: 1,
          color: "#10b981"
        });
      });
    });

    // Source substrings mapped to the team label used in the graph.
    const knownSources: Array<[string[], string]> = [
      [['OpenAI'], 'OpenAI Research'],
      [['Anthropic'], 'Anthropic'],
      [['Google', 'DeepMind'], 'Google DeepMind'],
      [['LangChain'], 'LangChain Team'],
      [['Research Collective'], 'AI Research Collective']
    ];

    const researchTeams = new Map<string, number[]>();
    documents.forEach(doc => {
      if (!doc.metadata) {
        return;
      }

      // One malformed metadata string must not fail the whole graph; such a
      // document falls back to the source-based team below.
      let metadata: any = {};
      try {
        const parsed = typeof doc.metadata === 'string' ? JSON.parse(doc.metadata) : doc.metadata;
        if (parsed && typeof parsed === 'object') {
          metadata = parsed;
        }
      } catch (parseError) {
        console.warn(`Ignoring unparseable metadata for document ${doc.id}`);
      }

      const source = doc.source;
      let teamName = '';

      if (metadata.authors && Array.isArray(metadata.authors)) {
        teamName = metadata.venue || 'Research Team';
      } else if (metadata.venue) {
        teamName = metadata.venue;
      } else if (source) {
        const known = knownSources.find(([needles]) => needles.some(needle => source.includes(needle)));
        teamName = known ? known[1] : 'Research Community';
      }

      if (!teamName) {
        return;
      }

      const members = researchTeams.get(teamName) ?? [];
      members.push(doc.id);
      researchTeams.set(teamName, members);
    });

    researchTeams.forEach((docIds, teamName) => {
      const teamNodeId = `team_${teamName.replace(/\s+/g, '_')}`;
      nodes.push({
        id: teamNodeId,
        label: teamName,
        type: "author",
        size: 8 + docIds.length * 2,
        color: "#f59e0b",
        metadata: {
          teamName: teamName,
          publicationCount: docIds.length
        }
      });

      docIds.forEach(docId => {
        links.push({
          source: teamNodeId,
          target: `doc_${docId}`,
          relationship: "authored_by",
          strength: 0.8,
          color: "#f59e0b"
        });
      });
    });

    const sourceTypes = new Map<string, number[]>();
    documents.forEach(doc => {
      const sourceType = doc.sourceType || 'unknown';
      const members = sourceTypes.get(sourceType) ?? [];
      members.push(doc.id);
      sourceTypes.set(sourceType, members);
    });

    sourceTypes.forEach((docIds, sourceType) => {
      if (docIds.length < 2) {
        return;
      }

      nodes.push({
        id: `source_${sourceType}`,
        label: sourceType.charAt(0).toUpperCase() + sourceType.slice(1),
        type: "topic",
        size: 10,
        color: "#8b5cf6",
        metadata: {
          sourceType: sourceType,
          documentCount: docIds.length
        }
      });

      docIds.forEach(docId => {
        links.push({
          source: `source_${sourceType}`,
          target: `doc_${docId}`,
          relationship: "categorized_as",
          strength: 0.6,
          color: "#8b5cf6"
        });
      });
    });

    res.json({
      nodes,
      links,
      stats: {
        totalDocuments: documents.length,
        // Counts every core concept found at least once, including those
        // that did not reach the two-document threshold for a node.
        totalConcepts: conceptToDocuments.size,
        totalResearchTeams: researchTeams.size,
        totalSourceTypes: sourceTypes.size
      }
    });
  } catch (error) {
    console.error("Knowledge graph generation failed:", error);
    res.status(500).json({
      error: "Failed to generate knowledge graph",
      nodes: [],
      links: [],
      stats: { totalDocuments: 0, totalConcepts: 0, totalResearchTeams: 0, totalSourceTypes: 0 }
    });
  }
});
|
|
|
|
|
// POST /api/search
//
// 1. Parses the body with searchRequestSchema (400 on a Zod error).
// 2. Searches local storage. For "semantic" searches the query is first
//    expanded with related terms; documents found only through an expansion
//    term get +0.2 relevance, and every semantic hit gets a further +0.6
//    (capped at 1.0). Other search types get +0.5.
// 3. Drops local documents whose URL fails validateUrls.
// 4. Queries GitHub (code/AI queries only, and only when GITHUB_TOKEN is set)
//    and the web in parallel, each bounded by its own timeout.
// 5. When external results exist, merges them with the local ones, sorts by
//    relevance and truncates to the requested limit.
app.post("/api/search", async (req, res) => {
  try {
    const searchRequest = searchRequestSchema.parse(req.body);
    const startTime = Date.now();
    const queryLower = searchRequest.query.toLowerCase();

    const hasPhrase = (phrase: string): boolean => queryLower.includes(phrase);
    // Short tokens such as "ai" or "rag" occur inside ordinary words
    // ("explain", "storage"), so they must match as whole words.
    const hasWord = (word: string): boolean => new RegExp(`\\b${word}s?\\b`).test(queryLower);

    let allDocuments: any[] = [];

    if (searchRequest.searchType === "semantic") {
      console.log(`Enhanced multi-source search for: "${searchRequest.query}"`);
      console.log('Searching knowledge base...');

      const expansionRules: Array<{ applies: boolean; terms: string[] }> = [
        { applies: hasPhrase('mistral'), terms: ['Mixtral', 'Mistral AI'] },
        { applies: hasPhrase('mixtral'), terms: ['Mistral', 'mixture of experts'] },
        { applies: hasPhrase('llama'), terms: ['LLaMA', 'Large Language Model Meta AI'] },
        { applies: hasPhrase('gpt'), terms: ['GPT', 'Generative Pre-trained Transformer'] },
        {
          applies: hasPhrase('transformer') || hasPhrase('attention'),
          terms: ['Attention Is All You Need', 'transformer', 'attention mechanism']
        },
        { applies: hasPhrase('constitutional'), terms: ['Constitutional AI', 'harmlessness', 'AI feedback'] },
        {
          applies: hasWord('rag') || hasPhrase('retrieval'),
          terms: ['Retrieval-Augmented Generation', 'retrieval augmented', 'knowledge-intensive']
        }
      ];

      const searchQueries = [searchRequest.query];
      expansionRules.forEach(rule => {
        if (rule.applies) {
          searchQueries.push(...rule.terms);
        }
      });

      // Queries run in order so that the first query to return a document
      // decides its boost.
      const allSearchResults = new Map<number, any>();
      for (const query of searchQueries) {
        const searchResult = await storage.searchDocuments({ ...searchRequest, query });
        const relevanceBoost = query !== searchRequest.query ? 0.2 : 0;

        for (const doc of searchResult.results || []) {
          if (allSearchResults.has(doc.id)) {
            continue;
          }
          allSearchResults.set(doc.id, {
            ...doc,
            relevanceScore: Math.min(doc.relevanceScore + relevanceBoost, 1.0)
          });
        }
      }

      allDocuments = Array.from(allSearchResults.values()).map(doc => ({
        ...doc,
        relevanceScore: Math.min(doc.relevanceScore + 0.6, 1.0),
        snippet: doc.snippet || doc.content.substring(0, 200) + '...'
      }));

      console.log(`Found ${allDocuments.length} local documents`);
      console.log(`Query expansion searched for: ${searchQueries.join(', ')}`);
    } else {
      const localResults = await storage.searchDocuments(searchRequest);

      allDocuments = (localResults.results || []).map(doc => ({
        ...doc,
        relevanceScore: Math.min(doc.relevanceScore + 0.5, 1.0)
      }));
    }

    if (allDocuments.length > 0) {
      console.log('Validating URLs in local storage results...');
      const documentsWithUrls = allDocuments.filter(doc => doc.url);

      if (documentsWithUrls.length > 0) {
        const urls = documentsWithUrls.map(doc => doc.url).filter((url): url is string => url !== null);
        const validationResults = await validateUrls(urls);

        allDocuments = allDocuments.filter(doc => {
          // Documents without a URL have nothing to validate and are kept.
          if (!doc.url) return true;

          const isValid = validationResults.get(doc.url);
          if (!isValid) {
            console.log(`Filtered out local document with invalid URL: ${doc.url} (${doc.title})`);
          }
          return isValid;
        });

        console.log(`Local URL validation completed. ${allDocuments.length} documents have valid URLs.`);
      }
    }

    console.log(`Searching external sources to supplement ${allDocuments.length} local results...`);

    const isCodeQuery =
      ['python', 'data structures', 'algorithm', 'repository', 'programming', 'github'].some(hasPhrase) ||
      hasWord('code');

    const isAIQuery =
      ['mistral', 'llama', 'transformer', 'gpt', 'machine learning', 'neural network'].some(hasPhrase) ||
      hasWord('ai');

    const perSourceLimit = Math.min(3, Math.ceil(searchRequest.limit / 3));
    // External ranks continue after the local results.
    const localCount = allDocuments.length;

    // Resolves with `fallback` when `work` takes longer than `ms`; the timer
    // is cleared as soon as either side settles.
    const withTimeout = <T>(work: Promise<T>, ms: number, fallback: T): Promise<T> => {
      let timer: ReturnType<typeof setTimeout> | undefined;
      const expiry = new Promise<T>(resolve => {
        timer = setTimeout(() => resolve(fallback), ms);
      });
      return Promise.race([work, expiry]).finally(() => clearTimeout(timer));
    };

    let githubSearch: Promise<any[]> = Promise.resolve([]);
    if ((isCodeQuery || isAIQuery) && process.env.GITHUB_TOKEN) {
      console.log('Searching GitHub...');
      githubSearch = withTimeout(
        searchGitHubRepos(searchRequest.query, perSourceLimit)
          .then(repos => repos.map((repo, index) =>
            transformGitHubRepoToDocument(repo, index + localCount, searchRequest.query)
          ))
          .catch((error: unknown) => {
            console.log('GitHub search failed:', error instanceof Error ? error.message : String(error));
            return [] as any[];
          }),
        8000,
        []
      );
    }

    console.log('Searching web...');
    const webSearch: Promise<any[]> = withTimeout(
      searchWeb(searchRequest.query, perSourceLimit)
        .then(webHits => webHits.map((result, index) =>
          transformWebResultToDocument(result, index + localCount, searchRequest.query)
        ))
        .catch((error: unknown) => {
          console.log('Web search failed:', error instanceof Error ? error.message : String(error));
          return [] as any[];
        }),
      5000,
      []
    );

    // Both promises handle their own failures and timeouts, so this cannot reject.
    const [githubResults, webResults] = await Promise.all([githubSearch, webSearch]);
    const allExternalResults = [...githubResults, ...webResults];

    console.log(`Found ${allExternalResults.length} external results (GitHub: ${githubResults.length}, Web: ${webResults.length})`);

    if (allExternalResults.length > 0) {
      allDocuments = [...allDocuments, ...allExternalResults]
        .sort((a, b) => b.relevanceScore - a.relevanceScore)
        .slice(0, searchRequest.limit);
    }

    console.log(`Total results: ${allDocuments.length}`);

    const searchTime = (Date.now() - startTime) / 1000;
    const response = {
      results: allDocuments,
      totalCount: allDocuments.length,
      searchTime,
      query: searchRequest.query,
      queryId: Date.now()
    };

    res.json(response);
  } catch (error) {
    if (error instanceof z.ZodError) {
      res.status(400).json({ message: "Invalid search request", errors: error.errors });
    } else {
      console.error('Search error:', error);
      res.status(500).json({ message: "Internal server error" });
    }
  }
});
|
|
|
|
|
// POST /api/explain
//
// Asks the Nebius chat-completion API for a short, spoken-style explanation
// of a document and returns it as { explanation }.
// Body: { title, snippet } - both required (400 otherwise).
// Answers 500 when the model call fails or yields no usable text.
app.post("/api/explain", async (req, res) => {
  try {
    const { title, snippet } = req.body;

    if (!title || !snippet) {
      return res.status(400).json({ message: "Title and snippet are required" });
    }

    const prompt = `You are an expert communicator. Explain this document directly in a clear, conversational way suitable for audio playback. Do not show your thinking process - just provide the final explanation.

Title: ${title}
Content: ${snippet}

Provide a brief, engaging explanation (2-3 sentences) that would be pleasant to listen to. Focus on the key concepts and practical value. Start your response immediately with the explanation.`;

    const response = await nebiusClient.createChatCompletion({
      model: "deepseek-ai/DeepSeek-R1-0528",
      messages: [{ role: "user", content: prompt }],
      max_tokens: 150,
      temperature: 0.7,
    });

    const rawContent = response.choices?.[0]?.message?.content;
    const explanation = typeof rawContent === 'string' ? cleanThinkingTags(rawContent) : '';

    // cleanThinkingTags returns an empty string when the reply is cut off
    // inside a <think> section; report that as a failure rather than sending
    // an empty explanation with status 200.
    if (!explanation) {
      console.error('AI explanation error: model returned no usable text');
      return res.status(500).json({ message: "Failed to generate explanation" });
    }

    res.json({ explanation });
  } catch (error) {
    console.error('AI explanation error:', error);
    res.status(500).json({ message: "Failed to generate explanation" });
  }
});
|
|
|
|
|
// POST /api/ai-search
// Runs a semantic search through smartIngestionService.enhancedSearch.
// Request body: { query: string, maxResults? (default 10), useQueryEnhancement? (default true) }.
// Responses: 200 with the service result; 400 when query is missing or not a string;
// 500 with the error message when the search throws.
app.post("/api/ai-search", async (req, res) => {
  try {
    const { query, maxResults = 10, useQueryEnhancement = true } = req.body;

    // Only a non-empty string is accepted as a query.
    const hasUsableQuery = typeof query === 'string' && query !== '';
    if (!hasUsableQuery) {
      return res.status(400).json({ message: "Query is required" });
    }

    const searchOutcome = await smartIngestionService.enhancedSearch(query, {
      maxResults,
      searchType: 'semantic',
      useQueryEnhancement,
    });

    res.json(searchOutcome);
  } catch (error) {
    console.error('AI search error:', error);

    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "AI search failed", error: detail });
  }
});
|
|
|
|
|
// POST /api/analyze-document
// Forwards the supplied content to nebiusClient.analyzeDocument.
// Request body: { content, analysisType? (default 'summary'), useMarkdown? (default true) }.
// Responses: 200 with the analysis; 400 when content is missing/falsy;
// 500 with the error message when the analysis throws.
app.post("/api/analyze-document", async (req, res) => {
  try {
    const { content, analysisType = 'summary', useMarkdown = true } = req.body;

    if (!content) {
      return res.status(400).json({ message: "Content is required" });
    }

    const analysisRequest = { content, analysisType, useMarkdown };
    const analysisResult = await nebiusClient.analyzeDocument(analysisRequest);

    res.json(analysisResult);
  } catch (error) {
    console.error('Document analysis error:', error);

    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Document analysis failed", error: detail });
  }
});
|
|
|
|
|
// POST /api/research-synthesis
// Builds a research synthesis for a query over a set of stored documents.
// Request body: { query, documentIds: array }.
// Responses: 200 with the synthesis; 400 when the body is invalid or none of the
// ids resolve to a document; 500 with the error message on failure.
app.post("/api/research-synthesis", async (req, res) => {
  try {
    const { query, documentIds } = req.body;

    const requestIsValid = Boolean(query) && Array.isArray(documentIds);
    if (!requestIsValid) {
      return res.status(400).json({ message: "Query and document IDs are required" });
    }

    // Look every id up in parallel, then drop the ones storage returned nothing for.
    const lookups = documentIds.map((id) => storage.getDocument(id));
    const validDocuments = (await Promise.all(lookups)).filter(Boolean);

    if (validDocuments.length === 0) {
      return res.status(400).json({ message: "No valid documents found" });
    }

    const synthesis = await smartIngestionService.generateResearchSynthesis(query, validDocuments);

    res.json(synthesis);
  } catch (error) {
    console.error('Research synthesis error:', error);

    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Research synthesis failed", error: detail });
  }
});
|
|
|
|
|
// POST /api/enhance-query
// Passes the query (and optional context) to nebiusClient.enhanceQuery and returns
// the result after removing <think> sections from its text fields.
// Request body: { query, context? }.
// Responses: 200 with the enhancement; 400 when query is missing/falsy;
// 500 with the error message on failure.
app.post("/api/enhance-query", async (req, res) => {
  try {
    const { query, context } = req.body;

    if (!query) {
      return res.status(400).json({ message: "Query is required" });
    }

    const enhancement = await nebiusClient.enhanceQuery(query, context);

    // Clean the two text fields in place, in the same order as before.
    for (const field of ['enhancedQuery', 'intent'] as const) {
      enhancement[field] = cleanThinkingTags(enhancement[field]);
    }

    res.json(enhancement);
  } catch (error) {
    console.error('Query enhancement error:', error);

    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Query enhancement failed", error: detail });
  }
});
|
|
|
|
|
// GET /api/modal-task/:taskId
// Returns whatever modalClient.getTaskStatus reports for the given task id.
// Responses: 200 with the status; 500 with the error message when the lookup throws.
app.get("/api/modal-task/:taskId", async (req, res) => {
  try {
    const taskStatus = await modalClient.getTaskStatus(req.params.taskId);
    res.json(taskStatus);
  } catch (error) {
    console.error('Modal task status error:', error);

    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Failed to get task status", error: detail });
  }
});
|
|
|
|
|
// POST /api/batch-ingest
// Converts the submitted documents into upload descriptors and hands them to
// smartIngestionService.batchIngestDocuments.
// Request body: { documents: Array<{ content?, filename?, contentType?, metadata? }> }.
// Responses: 200 with the ingestion result; 400 when `documents` is missing, empty,
// or contains an entry that is not an object; 500 with the error message on failure.
app.post("/api/batch-ingest", async (req, res) => {
  try {
    const { documents } = req.body ?? {};

    if (!Array.isArray(documents) || documents.length === 0) {
      return res.status(400).json({ message: "Documents array is required" });
    }

    // Reject malformed entries up front: a null entry would otherwise throw while
    // mapping (reported as a 500), and a primitive would be ingested as an empty file.
    const malformedIndex = documents.findIndex(
      (doc) => doc === null || typeof doc !== 'object' || Array.isArray(doc)
    );
    if (malformedIndex !== -1) {
      return res
        .status(400)
        .json({ message: `Document at index ${malformedIndex} must be an object` });
    }

    // Fill in defaults for any field the client left out.
    const uploads = documents.map((doc) => ({
      file: doc.content || '',
      filename: doc.filename || 'unknown.txt',
      contentType: doc.contentType || 'text/plain',
      metadata: doc.metadata || {},
    }));

    const result = await smartIngestionService.batchIngestDocuments(uploads);
    res.json(result);
  } catch (error) {
    console.error('Batch ingestion error:', error);
    res.status(500).json({
      message: "Batch ingestion failed",
      error: error instanceof Error ? error.message : 'Unknown error',
    });
  }
});
|
|
|
|
|
// GET /api/health
// Runs checkAPIHealth (loaded on demand from ./api-health-check) and reports the
// per-service results.
// Responses: 200 when no service reports status 'error', 503 when at least one does,
// 500 when the health check itself throws.
app.get("/api/health", async (_req, res) => {
  try {
    const { checkAPIHealth } = await import('./api-health-check');
    const healthStatus = await checkAPIHealth();

    const overallHealthy = healthStatus.every((service) => service.status !== 'error');

    res.status(overallHealthy ? 200 : 503).json({
      overall: overallHealthy ? 'healthy' : 'issues_detected',
      services: healthStatus,
      timestamp: new Date().toISOString(),
    });
  } catch (error) {
    // Log the failure server-side as the other handlers do; previously it was
    // only echoed to the client.
    console.error('Health check error:', error);
    res.status(500).json({
      overall: 'error',
      message: 'Health check failed',
      error: error instanceof Error ? error.message : 'Unknown error',
    });
  }
});
|
|
|
|
|
// POST /api/embeddings
// Generates embeddings for a piece of text via nebiusClient.createEmbeddings.
// Request body: { input: string, model? (default 'text-embedding-ada-002') }.
// Responses: 200 with the embeddings; 400 when input is missing or not a string;
// 500 with the error message on failure.
app.post("/api/embeddings", async (req, res) => {
  try {
    const { input, model = 'text-embedding-ada-002' } = req.body ?? {};

    // The handler calls input.substring below, so anything other than a non-empty
    // string (array, number, object) must be rejected here rather than crash as a 500.
    if (typeof input !== 'string' || input.length === 0) {
      return res.status(400).json({ message: "Input text is required" });
    }

    // Log only a short preview of the text.
    console.log('Generating embeddings for input:', input.substring(0, 100) + '...');
    const embeddings = await nebiusClient.createEmbeddings({ input, model });
    console.log('Embeddings generated successfully');

    res.json(embeddings);
  } catch (error) {
    console.error('Embeddings error:', error);
    res.status(500).json({
      message: "Embedding generation failed",
      error: error instanceof Error ? error.message : 'Unknown error',
    });
  }
});
|
|
|
|
|
// GET /api/documents
// Returns one page of stored documents.
// Query parameters: limit (positive integer, default 50), offset (non-negative
// integer, default 0). Missing, non-numeric, or negative values use the default.
// Responses: 200 with the documents; 500 when storage throws.
app.get("/api/documents", async (req, res) => {
  try {
    // Parse a single query value as a base-10 integer; NaN when absent or not a string.
    const readInt = (value: unknown): number =>
      typeof value === 'string' ? Number.parseInt(value, 10) : Number.NaN;

    const requestedLimit = readInt(req.query.limit);
    const requestedOffset = readInt(req.query.offset);

    // NaN fails both comparisons, so bad input falls through to the defaults and
    // negative numbers never reach storage.
    const limit = requestedLimit > 0 ? requestedLimit : 50;
    const offset = requestedOffset > 0 ? requestedOffset : 0;

    const documents = await storage.getDocuments(limit, offset);
    res.json(documents);
  } catch (error) {
    console.error('Fetch documents error:', error);
    res.status(500).json({ message: "Failed to fetch documents" });
  }
});
|
|
|
|
|
|
|
// Choose which router serves /api/documents: the real upload routes or the fallback.

// Any of these variables being set is taken to mean "running on a Hugging Face Space".
// NOTE(review): HF_TOKEN can presumably be set outside a Space as well — confirm it
// belongs in this list.
const isHuggingFaceSpace = Boolean(
  process.env.SPACE_ID ||
    process.env.HF_SPACE_ID ||
    process.env.HUGGINGFACE_SPACE_ID ||
    process.env.HF_TOKEN
);

// Informational only: this value is logged below but does not influence routing.
// In production it merely checks that /tmp exists, not that it is writable.
const hasWritableStorage = process.env.NODE_ENV === 'production' ? fs.existsSync('/tmp') : true;

// On a Space uploads stay enabled even when DISABLE_UPLOADS=true; everywhere else
// DISABLE_UPLOADS=true switches to the fallback routes.
const isDocumentUploadEnabled = isHuggingFaceSpace || process.env.DISABLE_UPLOADS !== 'true';

console.log('🔍 Environment check:', {
  NODE_ENV: process.env.NODE_ENV,
  DISABLE_UPLOADS: process.env.DISABLE_UPLOADS,
  isHuggingFaceSpace,
  hasWritableStorage,
  isDocumentUploadEnabled,
});

if (isDocumentUploadEnabled) {
  console.log('✅ Document uploads enabled - full functionality available');
  app.use("/api/documents", documentRoutes);
} else {
  console.log('ℹ️ Document uploads disabled - using fallback routes');
  app.use("/api/documents", uploadFallbackRoutes);
}
|
|
|
const httpServer = createServer(app); |
|
return httpServer; |
|
} |