// Fix AI-enhanced search with comprehensive external source integration
// (commit cd55914, author: fazeel007)
import { Express } from "express";
import { createServer, Server } from "http";
import { z } from "zod";
import fs from "fs";
import { storage } from "./storage";
import { searchRequestSchema } from "@shared/schema";
import { smartIngestionService } from "./smart-ingestion";
import { nebiusClient } from "./nebius-client";
import { modalClient } from "./modal-client";
import documentRoutes from "./document-routes";
import uploadFallbackRoutes from "./upload-fallback";
// Shape of a repository object returned by the GitHub search API
// (api.github.com/search/repositories). Only the fields this module reads
// are declared here.
interface GitHubRepo {
  id: number;               // numeric GitHub repository id
  name: string;             // short repository name, e.g. "algorithms"
  full_name: string;        // "owner/name" slug
  // NOTE(review): the GitHub API can return null here — callers in this file
  // guard with `repo.description?` / `|| fallback`; consider `string | null`.
  description: string;
  html_url: string;         // browser-facing URL of the repository
  stargazers_count: number; // star count at search time
  language: string;         // primary language reported by GitHub
  topics: string[];         // repository topic tags
  created_at: string;       // ISO 8601 creation timestamp
  updated_at: string;       // ISO 8601 last-push/update timestamp
}
// Using Nebius client instead of OpenAI for all AI operations
// Strip DeepSeek R1 "<think>…</think>" reasoning sections from model output.
// Closed pairs (and their trailing whitespace) are removed; if an opening tag
// is left dangling (stream cut off mid-thought), everything from that tag
// onward is dropped. Cleaned output is trimmed; input without any tag is
// returned untouched (no trim).
function cleanThinkingTags(text: string): string {
  if (typeof text !== 'string' || !text.includes('<think>')) {
    return text;
  }
  // Remove every fully-closed <think>…</think> section plus trailing whitespace.
  let result = text.replace(/<think>[\s\S]*?<\/think>\s*/g, '');
  // A leftover opening tag means it was never closed — drop the remainder.
  const dangling = result.indexOf('<think>');
  if (dangling !== -1) {
    result = result.slice(0, dangling);
  }
  return result.trim();
}
// URL validation utility to check if websites are accessible and content is valid.
//
// Dispatches to specialised validators for domains known to return HTTP 200
// for missing content (arxiv.org, vldb.org, cvpr.org, icse.org) and falls
// back to a cheap HEAD request for everything else. 2xx/3xx counts as valid.
//
// @param url      absolute URL to probe
// @param timeout  abort the probe after this many milliseconds (default 5000)
// @returns true when the URL looks reachable/valid, false on any failure
async function validateUrl(url: string, timeout: number = 5000): Promise<boolean> {
  const controller = new AbortController();
  // One timer guards every path below; it is cleared in `finally` so it can
  // neither fire after we return nor keep the event loop alive.
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    console.log(`Validating URL: ${url}`);
    const urlObj = new URL(url);
    // Special handling for ArXiv URLs to validate paper existence
    if (urlObj.hostname.includes('arxiv.org')) {
      return await validateArxivUrl(url, controller.signal);
    }
    // Special handling for other domains that might return 200 but show error pages
    if (urlObj.hostname.includes('vldb.org') ||
        urlObj.hostname.includes('cvpr.org') ||
        urlObj.hostname.includes('icse.org')) {
      return await validateContentUrl(url, controller.signal);
    }
    // HEAD request for everything else. (The previous "highly trusted domain"
    // fast path issued a byte-identical request, so the two paths are merged.)
    const response = await fetch(url, {
      method: 'HEAD',
      signal: controller.signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (URL Validator)'
      }
    });
    // Consider 2xx and 3xx status codes as valid
    const isValid = response.status >= 200 && response.status < 400;
    console.log(`URL ${url} validation result: ${isValid ? 'VALID' : 'INVALID'} (${response.status})`);
    return isValid;
  } catch (error) {
    console.log(`URL ${url} validation failed: ${error instanceof Error ? error.message : String(error)}`);
    return false;
  } finally {
    // Fixes a leak in the original: the timer was never cleared on the
    // arxiv/content-validator delegation paths or on errors, so it could fire
    // long after the function returned and abort an already-settled request.
    clearTimeout(timeoutId);
  }
}
// Special validation for ArXiv URLs to check if papers actually exist.
// arxiv.org serves HTTP 200 with an error page for unknown identifiers, so a
// plain status check is not enough: the identifier format is validated first,
// then the fetched page body is scanned for known error phrases.
//
// @param url     an https://arxiv.org/abs/<id> style URL
// @param signal  abort signal shared with the caller's timeout
// @returns true when the id is well-formed and the page looks like a real
//          abstract page, false otherwise
async function validateArxivUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    // Extract paper ID from URL
    const match = url.match(/arxiv\.org\/abs\/(.+)$/);
    if (!match) {
      console.log(`Invalid ArXiv URL format: ${url}`);
      return false;
    }
    const paperId = match[1];
    // Validate ArXiv ID format. Both identifier schemes allow an optional
    // "vN" version suffix (e.g. 1706.03762v5); the previous regexes rejected
    // those valid URLs outright, producing false negatives.
    const validFormats = [
      /^\d{4}\.\d{4,5}(v\d+)?$/, // New format: 2024.12345 or 2024.12345v2
      /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/, // Old format: cs.AI/1234567
    ];
    const hasValidFormat = validFormats.some(regex => regex.test(paperId));
    if (!hasValidFormat) {
      console.log(`Invalid ArXiv paper ID format: ${paperId}`);
      return false;
    }
    // Try to fetch the paper to see if it exists
    const response = await fetch(url, {
      method: 'GET', // Need GET to check content
      signal: signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (ArXiv Validator)'
      }
    });
    if (!response.ok) {
      console.log(`ArXiv URL returned ${response.status}: ${url}`);
      return false;
    }
    // Check if the response contains error messages.
    // NOTE(review): the bare 'error' indicator is very broad and may
    // false-positive on legitimate pages (e.g. "error" in embedded scripts) —
    // consider tightening before trusting rejections here.
    const content = await response.text();
    const errorIndicators = [
      'not recognized',
      'might instead try to search',
      'article identifier',
      'not found',
      'error'
    ];
    const hasError = errorIndicators.some(indicator =>
      content.toLowerCase().includes(indicator.toLowerCase())
    );
    if (hasError) {
      console.log(`ArXiv paper not found: ${url}`);
      return false;
    }
    console.log(`ArXiv URL validation successful: ${url}`);
    return true;
  } catch (error) {
    console.log(`ArXiv URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}
// Validation for URLs that might return 200 but show error content.
// Fetches the page body and rejects it when any known error phrase appears;
// non-2xx responses and fetch failures are also treated as invalid.
async function validateContentUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    const response = await fetch(url, {
      method: 'GET', // Need GET to check content
      signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (Content Validator)'
      }
    });
    if (!response.ok) {
      console.log(`Content URL returned ${response.status}: ${url}`);
      return false;
    }
    // Scan the body once (lower-cased) for common error-page phrases.
    const body = (await response.text()).toLowerCase();
    const errorIndicators = [
      '404',
      'not found',
      'page not found',
      'does not exist',
      'error',
      'can\'t be reached',
      'site is temporarily unavailable'
    ];
    for (const indicator of errorIndicators) {
      if (body.includes(indicator.toLowerCase())) {
        console.log(`Content validation failed for: ${url}`);
        return false;
      }
    }
    console.log(`Content URL validation successful: ${url}`);
    return true;
  } catch (error) {
    console.log(`Content URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}
// Batch validate multiple URLs with a concurrency limit.
// URLs are processed in fixed-size chunks so at most `concurrencyLimit`
// requests are in flight at once; results map each URL to its validity.
async function validateUrls(urls: string[], concurrencyLimit: number = 5): Promise<Map<string, boolean>> {
  const outcome = new Map<string, boolean>();
  let offset = 0;
  while (offset < urls.length) {
    const chunk = urls.slice(offset, offset + concurrencyLimit);
    // Validate the whole chunk in parallel before moving to the next one.
    await Promise.all(
      chunk.map(async (target) => {
        outcome.set(target, await validateUrl(target));
      })
    );
    offset += concurrencyLimit;
  }
  return outcome;
}
// Enhanced web search using multiple authentic data sources.
// Queries up to three public APIs — Wikipedia, ArXiv (only for ML/AI/CS
// queries), and REST Countries (only for country/nation queries) — then
// filters the collected hits down to those whose URLs are actually reachable.
// Each per-source failure is logged and swallowed so one bad API never sinks
// the whole search.
//
// @param query       free-text user query
// @param maxResults  cap on the number of results returned (default 10)
// @returns array of { title, content, url, source, type } result objects
async function searchWeb(query: string, maxResults: number = 10): Promise<any[]> {
  const results = [];
  try {
    console.log(`Starting web search for: "${query}"`);
    // 1. Wikipedia search for general knowledge
    try {
      // First try Wikipedia search API.
      // NOTE(review): this is actually the page *summary* endpoint for an
      // exact-title guess (spaces -> underscores), not a full-text search.
      const wikiSearchUrl = `https://en.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(query.replace(/\s+/g, '_'))}`;
      console.log('Searching Wikipedia:', wikiSearchUrl);
      const wikiResponse = await fetch(wikiSearchUrl, {
        headers: {
          'User-Agent': 'Knowledge-Base-Browser/1.0'
        },
        signal: AbortSignal.timeout(3000) // 3 second timeout
      });
      if (wikiResponse.ok) {
        const wikiData = await wikiResponse.json();
        // Require a non-trivial extract so stub/disambiguation pages are skipped.
        if (wikiData.extract && wikiData.extract.length > 50) {
          results.push({
            title: wikiData.title,
            content: wikiData.extract,
            url: wikiData.content_urls?.desktop?.page || `https://en.wikipedia.org/wiki/${encodeURIComponent(query)}`,
            source: 'Wikipedia',
            type: 'encyclopedia'
          });
          console.log('Found Wikipedia result:', wikiData.title);
        }
      }
    } catch (wikiError) {
      // Non-fatal: continue with the remaining sources.
      console.log('Wikipedia search failed:', wikiError instanceof Error ? wikiError.message : String(wikiError));
    }
    // 2. ArXiv search for research papers (for ML/AI/CS topics)
    if (query.toLowerCase().includes('machine learning') ||
        query.toLowerCase().includes('neural network') ||
        query.toLowerCase().includes('algorithm') ||
        query.toLowerCase().includes('artificial intelligence') ||
        query.toLowerCase().includes('data science') ||
        query.toLowerCase().includes('deep learning')) {
      try {
        const arxivQuery = encodeURIComponent(query);
        // NOTE(review): plain http:// endpoint — confirm https is not required.
        const arxivUrl = `http://export.arxiv.org/api/query?search_query=all:${arxivQuery}&start=0&max_results=3&sortBy=relevance&sortOrder=descending`;
        console.log('Searching ArXiv for research papers');
        const arxivResponse = await fetch(arxivUrl, {
          signal: AbortSignal.timeout(5000) // 5 second timeout
        });
        if (arxivResponse.ok) {
          const arxivXml = await arxivResponse.text();
          // Parse ArXiv XML response.
          // Naive split-based parsing: each "<entry>" chunk is scanned with
          // regexes rather than a real XML parser; at most 2 entries are used.
          const entries = arxivXml.split('<entry>').slice(1);
          for (const entry of entries.slice(0, 2)) {
            const titleMatch = entry.match(/<title[^>]*>([^<]+)<\/title>/);
            const summaryMatch = entry.match(/<summary[^>]*>([^<]+)<\/summary>/);
            const linkMatch = entry.match(/<id[^>]*>([^<]+)<\/id>/);
            if (titleMatch && summaryMatch && linkMatch) {
              const title = titleMatch[1].trim();
              const summary = summaryMatch[1].trim().substring(0, 300); // cap abstract at 300 chars
              const url = linkMatch[1].trim();
              if (title && summary.length > 50) {
                results.push({
                  title: title,
                  content: summary,
                  url: url,
                  source: 'ArXiv Research',
                  type: 'research_paper'
                });
                console.log('Found ArXiv paper:', title);
              }
            }
          }
        }
      } catch (arxivError) {
        console.log('ArXiv search failed:', arxivError instanceof Error ? arxivError.message : String(arxivError));
      }
    }
    // 3. Try REST Countries API for country-related queries
    if (query.toLowerCase().includes('country') || query.toLowerCase().includes('nation')) {
      try {
        // Strip the trigger words so only the (presumed) country name remains.
        const countryQuery = query.replace(/country|nation/gi, '').trim();
        const countryUrl = `https://restcountries.com/v3.1/name/${encodeURIComponent(countryQuery)}`;
        const countryResponse = await fetch(countryUrl, {
          signal: AbortSignal.timeout(3000) // 3 second timeout
        });
        if (countryResponse.ok) {
          const countryData = await countryResponse.json();
          if (Array.isArray(countryData) && countryData.length > 0) {
            const country = countryData[0]; // best match only
            results.push({
              title: `${country.name.common} - Country Information`,
              content: `${country.name.common} is located in ${country.region}, ${country.subregion}. Capital: ${country.capital?.[0] || 'N/A'}. Population: ${country.population?.toLocaleString() || 'Unknown'}. Official languages: ${Object.values(country.languages || {}).join(', ')}.`,
              url: `https://en.wikipedia.org/wiki/${encodeURIComponent(country.name.common)}`,
              source: 'REST Countries API',
              type: 'geographic'
            });
            console.log('Found country information:', country.name.common);
          }
        }
      } catch (countryError) {
        console.log('Country search failed:', countryError instanceof Error ? countryError.message : String(countryError));
      }
    }
    console.log(`Web search completed. Found ${results.length} results.`);
    // Validate URLs before returning results
    if (results.length > 0) {
      console.log('Validating URLs for accessibility...');
      const urls = results.map(result => result.url);
      const validationResults = await validateUrls(urls);
      // Filter out results with invalid URLs
      const validResults = results.filter(result => {
        const isValid = validationResults.get(result.url);
        if (!isValid) {
          console.log(`Filtered out invalid URL: ${result.url} (${result.title})`);
        }
        return isValid;
      });
      console.log(`URL validation completed. ${validResults.length}/${results.length} URLs are accessible.`);
      return validResults.slice(0, maxResults);
    }
    return results.slice(0, maxResults);
  } catch (error) {
    // Outer catch: any unexpected failure yields an empty result set rather
    // than propagating to the route handler.
    console.error('Web search error:', error);
    return [];
  }
}
// Transform a raw searchWeb() hit into the document shape the search API
// returns. External results are scored lower than local knowledge-base hits:
// 0.6 at rank 0, decreasing by 0.1 per rank, floored at 0.2.
function transformWebResultToDocument(result: any, rank: number, query: string): any {
  const text: string = result.content;
  // Truncate long bodies to a 200-character preview.
  const preview = text.length > 200 ? `${text.slice(0, 200)}...` : text;
  const doc: any = {
    id: `web_${Date.now()}_${rank}`, // synthetic id; unique per call+rank
    title: result.title,
    content: text,
    snippet: preview,
    source: result.source,
    sourceType: 'web',
    url: result.url,
    metadata: {
      search_type: result.type,
      fetched_at: new Date().toISOString()
    },
    relevanceScore: Math.max(0.2, 0.6 - rank * 0.1),
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.2 + 0.1, // simulated latency in seconds
    tokenCount: Math.floor(text.length / 4) // rough 4-chars-per-token estimate
  };
  return doc;
}
// Search GitHub's repository search API for repositories relevant to the
// query. Builds a search expression from query heuristics ("by <author>"
// patterns, data-structures/algorithm keywords, default python filter),
// retries with fallback expressions when an author-scoped search comes back
// empty, and finally drops repos whose html_url fails validation.
//
// @param query       free-text user query
// @param maxResults  per_page value sent to the GitHub API (default 10)
// @returns array of raw GitHub repo objects (possibly URL-filtered), or []
async function searchGitHubRepos(query: string, maxResults: number = 10): Promise<any[]> {
  try {
    // Parse query to extract author and repository details
    const lowerQuery = query.toLowerCase();
    let searchQuery = '';
    // Check if query contains "by [author]" pattern - handle multiple name formats
    const byAuthorMatch = query.match(/by\s+([a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*)/i);
    if (byAuthorMatch) {
      const authorName = byAuthorMatch[1].trim();
      // Remainder of the query once the "by <author>" clause is removed.
      const topicPart = query.replace(/by\s+[a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*/i, '').trim();
      // Try different author search strategies - include multiple language options
      // NOTE(review): only authorSearches[0] is ever used below; entries 1-3
      // are built but dead.
      const authorSearches = [
        `${topicPart} user:${authorName.replace(/\s+/g, '')}`, // No language restriction first
        `${topicPart} user:${authorName.replace(/\s+/g, '')} language:python`,
        `${topicPart} user:${authorName.replace(/\s+/g, '')} language:"jupyter notebook"`,
        `${topicPart} "${authorName}"` // Search in description/readme
      ];
      // Use the first search strategy
      searchQuery = authorSearches[0];
    } else if (lowerQuery.includes('data structures') || lowerQuery.includes('algorithm')) {
      // Enhanced search for data structures and algorithms
      searchQuery = `${query} "data structures" OR "algorithms" language:python`;
    } else {
      // Default: restrict to Python repositories.
      searchQuery = `${query} language:python`;
    }
    console.log('GitHub search query:', searchQuery);
    // NOTE(review): if GITHUB_TOKEN is unset this sends "token undefined";
    // the /api/search route gates on the env var, but other callers may not.
    const response = await fetch(`https://api.github.com/search/repositories?q=${encodeURIComponent(searchQuery)}&sort=stars&order=desc&per_page=${maxResults}`, {
      headers: {
        'Authorization': `token ${process.env.GITHUB_TOKEN}`,
        'Accept': 'application/vnd.github.v3+json',
        'User-Agent': 'Knowledge-Base-Browser'
      }
    });
    if (!response.ok) {
      console.error('GitHub API error:', response.status, response.statusText);
      return [];
    }
    const data = await response.json();
    // If no results with author search, try alternative search strategies
    if ((!data.items || data.items.length === 0) && byAuthorMatch) {
      const authorName = byAuthorMatch[1].trim();
      const topicPart = query.replace(/by\s+[a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*/i, '').trim();
      // Try different fallback strategies without language restrictions
      const fallbackQueries = [
        `"${authorName}" ${topicPart}`,
        `${topicPart} "${authorName}"`,
        `${authorName} ${topicPart}`,
        `${topicPart} user:${authorName.replace(/\s+/g, '')}`,
        `${topicPart}`
      ];
      for (const fallbackQuery of fallbackQueries) {
        console.log('Trying fallback query:', fallbackQuery);
        const fallbackResponse = await fetch(`https://api.github.com/search/repositories?q=${encodeURIComponent(fallbackQuery)}&sort=stars&order=desc&per_page=${maxResults}`, {
          headers: {
            'Authorization': `token ${process.env.GITHUB_TOKEN}`,
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'Knowledge-Base-Browser'
          }
        });
        if (fallbackResponse.ok) {
          const fallbackData = await fallbackResponse.json();
          if (fallbackData.items && fallbackData.items.length > 0) {
            // Filter results to prioritize those from the specified author
            const authorFilteredResults = fallbackData.items.filter((repo: any) =>
              repo.owner.login.toLowerCase().includes(authorName.toLowerCase()) ||
              repo.full_name.toLowerCase().includes(authorName.toLowerCase()) ||
              repo.description?.toLowerCase().includes(authorName.toLowerCase())
            );
            // NOTE(review): these early returns skip the URL validation that
            // the main path below performs — confirm that is intentional.
            if (authorFilteredResults.length > 0) {
              return authorFilteredResults;
            } else {
              return fallbackData.items;
            }
          }
        }
      }
    }
    const repos = data.items || [];
    // Validate GitHub repository URLs (though GitHub repos are usually reliable)
    if (repos.length > 0) {
      console.log('Validating GitHub repository URLs...');
      const urls = repos.map((repo: GitHubRepo) => repo.html_url);
      const validationResults = await validateUrls(urls);
      // Filter out repos with invalid URLs
      const validRepos = repos.filter((repo: GitHubRepo) => {
        const isValid = validationResults.get(repo.html_url);
        if (!isValid) {
          console.log(`Filtered out invalid GitHub repo: ${repo.html_url} (${repo.full_name})`);
        }
        return isValid;
      });
      console.log(`GitHub URL validation completed. ${validRepos.length}/${repos.length} repositories are accessible.`);
      return validRepos;
    }
    return repos;
  } catch (error) {
    // Any unexpected failure yields an empty result set for the caller.
    console.error('Error fetching GitHub repos:', error);
    return [];
  }
}
// Convert a GitHub search-API repo into the document shape the search API
// returns. GitHub results score 0.7 at rank 0, decreasing by 0.1 per rank,
// floored at 0.3.
function transformGitHubRepoToDocument(repo: GitHubRepo, rank: number, query: string): any {
  const desc = repo.description;
  // 200-char preview of the description, or a fixed placeholder.
  const preview = desc
    ? desc.substring(0, 200) + (desc.length > 200 ? '...' : '')
    : 'No description available';
  // Multi-line content block: description, blank line, then key repo facts.
  const body = [
    desc || 'No description available',
    '',
    `Repository: ${repo.full_name}`,
    `Language: ${repo.language}`,
    `Stars: ${repo.stargazers_count}`,
    `Topics: ${repo.topics.join(', ')}`,
    `Created: ${repo.created_at}`,
    `Last Updated: ${repo.updated_at}`
  ].join('\n');
  return {
    id: repo.id,
    title: `${repo.name} - ${repo.full_name}`,
    content: body,
    snippet: preview,
    source: 'GitHub Repository',
    sourceType: 'code',
    url: repo.html_url,
    metadata: {
      stars: repo.stargazers_count,
      language: repo.language,
      topics: repo.topics,
      created_at: repo.created_at,
      updated_at: repo.updated_at
    },
    relevanceScore: Math.max(0.3, 0.7 - rank * 0.1),
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.3 + 0.1, // simulated latency in seconds
    tokenCount: Math.floor((repo.description?.length || 100) / 4) // ~4 chars/token
  };
}
export async function registerRoutes(app: Express): Promise<Server> {
// Knowledge graph data endpoint
app.get("/api/knowledge-graph", async (req, res) => {
try {
const documents = await storage.getDocuments(50);
const nodes: any[] = [];
const links: any[] = [];
// Create document nodes from actual storage
documents.forEach(doc => {
nodes.push({
id: `doc_${doc.id}`,
label: doc.title.substring(0, 50) + (doc.title.length > 50 ? "..." : ""),
type: "document",
size: 12,
color: "#3b82f6",
metadata: {
title: doc.title,
sourceType: doc.sourceType,
year: new Date(doc.createdAt).getFullYear(),
id: doc.id
}
});
});
// Extract concepts from document content
const conceptMap = new Map<string, number>();
const conceptToDocuments = new Map<string, number[]>();
documents.forEach(doc => {
const content = doc.content.toLowerCase();
const concepts = [
'ai', 'artificial intelligence', 'machine learning', 'deep learning',
'neural networks', 'transformer', 'attention', 'embedding', 'vector',
'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini',
'multimodal', 'fine-tuning', 'training', 'optimization', 'safety',
'alignment', 'reasoning', 'language model', 'nlp', 'computer vision'
];
concepts.forEach(concept => {
if (content.includes(concept)) {
conceptMap.set(concept, (conceptMap.get(concept) || 0) + 1);
if (!conceptToDocuments.has(concept)) {
conceptToDocuments.set(concept, []);
}
conceptToDocuments.get(concept)!.push(doc.id);
}
});
});
// Create document-to-document connections based on shared concepts
const documentConnections = new Map<string, Set<number>>();
documents.forEach(doc1 => {
const doc1Concepts = new Set<string>();
const content1 = doc1.content.toLowerCase();
// Enhanced concept detection for better connections
const allConcepts = [
'ai', 'artificial intelligence', 'machine learning', 'deep learning',
'neural networks', 'transformer', 'attention', 'embedding', 'vector',
'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini',
'multimodal', 'fine-tuning', 'training', 'optimization', 'safety',
'alignment', 'reasoning', 'language model', 'nlp', 'computer vision',
'code generation', 'programming', 'software', 'development', 'copilot',
'constitutional ai', 'rlhf', 'instruction tuning', 'benchmarks',
'performance', 'efficiency', 'compression', 'quantization', 'edge ai',
'mamba', 'mixture of experts', 'moe', 'architecture', 'scaling'
];
allConcepts.forEach(concept => {
if (content1.includes(concept)) {
doc1Concepts.add(concept);
}
});
// Find related documents with shared concepts
documents.forEach(doc2 => {
if (doc1.id !== doc2.id) {
const content2 = doc2.content.toLowerCase();
let sharedConcepts = 0;
doc1Concepts.forEach(concept => {
if (content2.includes(concept)) {
sharedConcepts++;
}
});
// Create connection if documents share 3+ concepts
if (sharedConcepts >= 3) {
const connectionKey = `${Math.min(doc1.id, doc2.id)}_${Math.max(doc1.id, doc2.id)}`;
if (!documentConnections.has(connectionKey)) {
documentConnections.set(connectionKey, new Set([doc1.id, doc2.id]));
links.push({
source: `doc_${doc1.id}`,
target: `doc_${doc2.id}`,
relationship: "related_concepts",
strength: Math.min(sharedConcepts / 10, 1),
color: "#3b82f6"
});
}
}
}
});
});
// Create concept nodes for concepts that appear in multiple documents
conceptMap.forEach((count, concept) => {
if (count >= 2) {
nodes.push({
id: `concept_${concept.replace(/\s+/g, '_')}`,
label: concept,
type: "concept",
size: 8 + count * 2,
color: "#10b981",
metadata: {
documentCount: count,
concept: concept
}
});
// Link concept to documents
const relatedDocs = conceptToDocuments.get(concept) || [];
relatedDocs.forEach(docId => {
links.push({
source: `doc_${docId}`,
target: `concept_${concept.replace(/\s+/g, '_')}`,
relationship: "contains_concept",
strength: 1,
color: "#10b981"
});
});
}
});
// Extract research teams from document metadata
const researchTeams = new Map<string, number[]>();
documents.forEach(doc => {
if (doc.metadata) {
let teamName = '';
const metadata = typeof doc.metadata === 'string' ? JSON.parse(doc.metadata) : doc.metadata;
// Extract team names from authors or venue
if (metadata.authors && Array.isArray(metadata.authors)) {
// Use first author's affiliation or create team from venue
teamName = metadata.venue || 'Research Team';
} else if (metadata.venue) {
teamName = metadata.venue;
} else if (doc.source) {
// Extract team from source
if (doc.source.includes('OpenAI')) teamName = 'OpenAI Research';
else if (doc.source.includes('Anthropic')) teamName = 'Anthropic';
else if (doc.source.includes('Google') || doc.source.includes('DeepMind')) teamName = 'Google DeepMind';
else if (doc.source.includes('LangChain')) teamName = 'LangChain Team';
else if (doc.source.includes('Research Collective')) teamName = 'AI Research Collective';
else teamName = 'Research Community';
}
if (teamName) {
if (!researchTeams.has(teamName)) {
researchTeams.set(teamName, []);
}
researchTeams.get(teamName)!.push(doc.id);
}
}
});
// Create research team nodes
researchTeams.forEach((docIds, teamName) => {
nodes.push({
id: `team_${teamName.replace(/\s+/g, '_')}`,
label: teamName,
type: "author",
size: 8 + docIds.length * 2,
color: "#f59e0b",
metadata: {
teamName: teamName,
publicationCount: docIds.length
}
});
// Link team to documents
docIds.forEach(docId => {
links.push({
source: `team_${teamName.replace(/\s+/g, '_')}`,
target: `doc_${docId}`,
relationship: "authored_by",
strength: 0.8,
color: "#f59e0b"
});
});
});
// Create source type clusters
const sourceTypes = new Map<string, number[]>();
documents.forEach(doc => {
const sourceType = doc.sourceType || 'unknown';
if (!sourceTypes.has(sourceType)) {
sourceTypes.set(sourceType, []);
}
sourceTypes.get(sourceType)!.push(doc.id);
});
sourceTypes.forEach((docIds, sourceType) => {
if (docIds.length >= 2) {
nodes.push({
id: `source_${sourceType}`,
label: sourceType.charAt(0).toUpperCase() + sourceType.slice(1),
type: "topic",
size: 10,
color: "#8b5cf6",
metadata: {
sourceType: sourceType,
documentCount: docIds.length
}
});
// Link source type to documents
docIds.forEach(docId => {
links.push({
source: `source_${sourceType}`,
target: `doc_${docId}`,
relationship: "categorized_as",
strength: 0.6,
color: "#8b5cf6"
});
});
}
});
res.json({
nodes,
links,
stats: {
totalDocuments: documents.length,
totalConcepts: conceptMap.size,
totalResearchTeams: researchTeams.size,
totalSourceTypes: sourceTypes.size
}
});
} catch (error) {
console.error("Knowledge graph generation failed:", error);
res.status(500).json({
error: "Failed to generate knowledge graph",
nodes: [],
links: [],
stats: { totalDocuments: 0, totalConcepts: 0, totalResearchTeams: 0, totalSourceTypes: 0 }
});
}
});
// Enhanced search with web fallback
app.post("/api/search", async (req, res) => {
try {
const searchRequest = searchRequestSchema.parse(req.body);
const streaming = req.body.streaming === true;
const startTime = Date.now();
let allDocuments: any[] = [];
// Enhanced multi-source search for semantic queries
if (searchRequest.searchType === "semantic") {
console.log(`๐Ÿ” Enhanced multi-source search for: "${searchRequest.query}"`);
// 1. First, always do keyword search on knowledge base
console.log('๐Ÿ“š Searching knowledge base...');
// Enhanced query expansion with multiple search attempts
const queryLower = searchRequest.query.toLowerCase();
const searchQueries = [searchRequest.query]; // Start with original query
// Add related terms for better matching
if (queryLower.includes('mistral')) {
searchQueries.push('Mixtral', 'Mistral AI');
}
if (queryLower.includes('mixtral')) {
searchQueries.push('Mistral', 'mixture of experts');
}
if (queryLower.includes('llama')) {
searchQueries.push('LLaMA', 'Large Language Model Meta AI');
}
if (queryLower.includes('gpt')) {
searchQueries.push('GPT', 'Generative Pre-trained Transformer');
}
if (queryLower.includes('transformer') || queryLower.includes('attention')) {
searchQueries.push('Attention Is All You Need', 'transformer', 'attention mechanism');
}
if (queryLower.includes('constitutional')) {
searchQueries.push('Constitutional AI', 'harmlessness', 'AI feedback');
}
if (queryLower.includes('rag') || queryLower.includes('retrieval')) {
searchQueries.push('Retrieval-Augmented Generation', 'retrieval augmented', 'knowledge-intensive');
}
// Search with each query and combine results
const allSearchResults = new Map<number, any>();
for (const query of searchQueries) {
const searchResult = await storage.searchDocuments({ ...searchRequest, query });
for (const doc of searchResult.results || []) {
if (!allSearchResults.has(doc.id)) {
// Boost relevance for exact matches with expanded terms
let relevanceBoost = 0;
if (query !== searchRequest.query) {
relevanceBoost = 0.2; // Boost expanded term matches
}
allSearchResults.set(doc.id, {
...doc,
relevanceScore: Math.min(doc.relevanceScore + relevanceBoost, 1.0)
});
}
}
}
allDocuments = Array.from(allSearchResults.values());
allDocuments = allDocuments.map(doc => ({
...doc,
relevanceScore: Math.min(doc.relevanceScore + 0.6, 1.0), // Boost local results
rank: doc.rank,
snippet: doc.snippet || doc.content.substring(0, 200) + '...'
}));
console.log(`๐Ÿ“š Found ${allDocuments.length} local documents`);
console.log(`๐Ÿ“š Query expansion searched for: ${searchQueries.join(', ')}`);
// Skip AI enhancement for now to test query expansion
// TODO: Re-enable AI enhancement after fixing query expansion
} else {
// Use regular keyword search for other search types
const localResults = await storage.searchDocuments(searchRequest);
// Boost relevance scores for knowledge base documents to prioritize them
allDocuments = (localResults.results || []).map(doc => ({
...doc,
relevanceScore: Math.min(doc.relevanceScore + 0.5, 1.0) // Boost by 0.5
}));
}
// Validate URLs in local storage results as well
if (allDocuments.length > 0) {
console.log('Validating URLs in local storage results...');
const documentsWithUrls = allDocuments.filter(doc => doc.url);
if (documentsWithUrls.length > 0) {
const urls = documentsWithUrls.map(doc => doc.url).filter((url): url is string => url !== null);
const validationResults = await validateUrls(urls);
// Filter out documents with invalid URLs
allDocuments = allDocuments.filter(doc => {
if (!doc.url) return true; // Keep documents without URLs
const isValid = validationResults.get(doc.url);
if (!isValid) {
console.log(`Filtered out local document with invalid URL: ${doc.url} (${doc.title})`);
}
return isValid;
});
console.log(`Local URL validation completed. ${allDocuments.length} documents have valid URLs.`);
}
}
// Always search external sources to provide comprehensive results
console.log(`๐ŸŒ Searching external sources to supplement ${allDocuments.length} local results...`);
// Check if we should search GitHub
const isCodeQuery = searchRequest.query.toLowerCase().includes('python') ||
searchRequest.query.toLowerCase().includes('data structures') ||
searchRequest.query.toLowerCase().includes('algorithm') ||
searchRequest.query.toLowerCase().includes('repository') ||
searchRequest.query.toLowerCase().includes('code') ||
searchRequest.query.toLowerCase().includes('programming') ||
searchRequest.query.toLowerCase().includes('github');
// Enhanced keyword detection for AI/ML queries that might have relevant code
const isAIQuery = searchRequest.query.toLowerCase().includes('mistral') ||
searchRequest.query.toLowerCase().includes('llama') ||
searchRequest.query.toLowerCase().includes('transformer') ||
searchRequest.query.toLowerCase().includes('gpt') ||
searchRequest.query.toLowerCase().includes('ai') ||
searchRequest.query.toLowerCase().includes('machine learning') ||
searchRequest.query.toLowerCase().includes('neural network');
// Query analysis for external search triggers
// Enhanced external search with better error handling and timeouts
const externalSearchPromises = [];
// GitHub search for code and AI-related queries
if ((isCodeQuery || isAIQuery) && process.env.GITHUB_TOKEN) {
console.log('๐Ÿ™ Searching GitHub...');
externalSearchPromises.push(
Promise.race([
searchGitHubRepos(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3)))
.then(repos => ({
type: 'github',
results: repos.map((repo, index) =>
transformGitHubRepoToDocument(repo, index + allDocuments.length, searchRequest.query)
)
}))
.catch(error => {
console.log('๐Ÿ™ GitHub search failed:', error.message);
return { type: 'github', results: [] };
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('GitHub search timeout')), 8000)
)
]).catch(() => ({ type: 'github', results: [] }))
);
}
// Always include web search for comprehensive coverage
console.log('๐ŸŒ Searching web...');
externalSearchPromises.push(
Promise.race([
searchWeb(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3)))
.then(webResults => ({
type: 'web',
results: webResults.map((result, index) =>
transformWebResultToDocument(result, index + allDocuments.length, searchRequest.query)
)
}))
.catch(error => {
console.log('๐ŸŒ Web search failed:', error.message);
return { type: 'web', results: [] };
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Web search timeout')), 5000)
)
]).catch(() => ({ type: 'web', results: [] }))
);
// Wait for external searches with timeout protection
if (externalSearchPromises.length > 0) {
try {
const externalResults = await Promise.all(externalSearchPromises);
// Flatten and combine results
const githubResult = externalResults.find((r: any) => r?.type === 'github') as any;
const webResult = externalResults.find((r: any) => r?.type === 'web') as any;
const githubResults = githubResult?.results || [];
const webResults = webResult?.results || [];
const allExternalResults = [...githubResults, ...webResults];
console.log(`๐ŸŒ Found ${allExternalResults.length} external results (GitHub: ${githubResults.length}, Web: ${webResults.length})`);
// Combine local and external results, keeping local results prioritized
if (allExternalResults.length > 0) {
allDocuments = [...allDocuments, ...allExternalResults]
.sort((a, b) => b.relevanceScore - a.relevanceScore)
.slice(0, searchRequest.limit);
}
} catch (externalError: any) {
console.log('๐ŸŒ External search failed:', externalError?.message || externalError);
}
}
console.log(`โœ… Total results: ${allDocuments.length}`);
const searchTime = (Date.now() - startTime) / 1000;
const response = {
results: allDocuments,
totalCount: allDocuments.length,
searchTime,
query: searchRequest.query,
queryId: Date.now()
};
res.json(response);
} catch (error) {
if (error instanceof z.ZodError) {
res.status(400).json({ message: "Invalid search request", errors: error.errors });
} else {
console.error('Search error:', error);
res.status(500).json({ message: "Internal server error" });
}
}
});
// AI explanation endpoint using Nebius
// POST /api/explain — returns a short spoken-word-friendly explanation of a
// document, built from its title and snippet only.
app.post("/api/explain", async (req, res) => {
  try {
    // Only title and snippet feed the prompt; the previously-destructured
    // `content` field was never used, so it is no longer pulled from the body.
    const { title, snippet } = req.body;
    if (!title || !snippet) {
      return res.status(400).json({ message: "Title and snippet are required" });
    }
    const prompt = `You are an expert communicator. Explain this document directly in a clear, conversational way suitable for audio playback. Do not show your thinking process - just provide the final explanation.
Title: ${title}
Content: ${snippet}
Provide a brief, engaging explanation (2-3 sentences) that would be pleasant to listen to. Focus on the key concepts and practical value. Start your response immediately with the explanation.`;
    const response = await nebiusClient.createChatCompletion({
      model: "deepseek-ai/DeepSeek-R1-0528", // Using DeepSeek model via Nebius
      messages: [{ role: "user", content: prompt }],
      max_tokens: 150,
      temperature: 0.7,
    });
    // Chat APIs may legally return null/undefined message content; guard so a
    // successful-but-empty completion yields an empty explanation, not a 500.
    const raw = response.choices[0]?.message?.content ?? '';
    const explanation = cleanThinkingTags(raw);
    res.json({ explanation });
  } catch (error) {
    console.error('AI explanation error:', error);
    res.status(500).json({ message: "Failed to generate explanation" });
  }
});
// Enhanced AI-powered semantic search backed by Nebius and Modal.
// POST /api/ai-search — body: { query, maxResults?, useQueryEnhancement? }
app.post("/api/ai-search", async (req, res) => {
  try {
    const { query, maxResults = 10, useQueryEnhancement = true } = req.body;

    // A usable query is a non-empty string; anything else is a client error.
    const queryIsUsable = typeof query === 'string' && Boolean(query);
    if (!queryIsUsable) {
      return res.status(400).json({ message: "Query is required" });
    }

    const searchOptions = {
      maxResults,
      searchType: 'semantic' as const,
      useQueryEnhancement
    };
    res.json(await smartIngestionService.enhancedSearch(query, searchOptions));
  } catch (error) {
    console.error('AI search error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "AI search failed", error: detail });
  }
});
// Document analysis via Nebius AI.
// POST /api/analyze-document — body: { content, analysisType?, useMarkdown? }
// Defaults to a markdown-formatted summary when the optional flags are absent.
app.post("/api/analyze-document", async (req, res) => {
  try {
    const { content, analysisType = 'summary', useMarkdown = true } = req.body;

    if (!content) {
      return res.status(400).json({ message: "Content is required" });
    }

    const request = { content, analysisType, useMarkdown };
    const analysis = await nebiusClient.analyzeDocument(request);
    res.json(analysis);
  } catch (error) {
    console.error('Document analysis error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Document analysis failed", error: detail });
  }
});
// Research synthesis using Nebius AI
// POST /api/research-synthesis — body: { query, documentIds: number[] }
// Loads the referenced documents and asks the ingestion service to
// synthesize them against the query.
app.post("/api/research-synthesis", async (req, res) => {
  try {
    const { query, documentIds } = req.body;
    if (!query || !Array.isArray(documentIds)) {
      return res.status(400).json({ message: "Query and document IDs are required" });
    }
    // Fetch all requested documents in parallel; unknown IDs resolve to undefined.
    const documents = await Promise.all(
      documentIds.map(id => storage.getDocument(id))
    );
    // filter(Boolean) does not narrow `T | undefined` under strict null checks,
    // so use an explicit type predicate to drop missing documents safely.
    const validDocuments = documents.filter(
      (doc): doc is NonNullable<typeof doc> => doc != null
    );
    if (validDocuments.length === 0) {
      return res.status(400).json({ message: "No valid documents found" });
    }
    const synthesis = await smartIngestionService.generateResearchSynthesis(
      query,
      validDocuments
    );
    res.json(synthesis);
  } catch (error) {
    console.error('Research synthesis error:', error);
    res.status(500).json({
      message: "Research synthesis failed",
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
});
// Query enhancement using Nebius AI.
// POST /api/enhance-query — body: { query, context? }
app.post("/api/enhance-query", async (req, res) => {
  try {
    const { query, context } = req.body;

    if (!query) {
      return res.status(400).json({ message: "Query is required" });
    }

    const enhancement = await nebiusClient.enhanceQuery(query, context);

    // DeepSeek occasionally leaks <think>...</think> reasoning into output;
    // scrub both string fields before sending them to the client.
    const { enhancedQuery, intent } = enhancement;
    enhancement.enhancedQuery = cleanThinkingTags(enhancedQuery);
    enhancement.intent = cleanThinkingTags(intent);

    res.json(enhancement);
  } catch (error) {
    console.error('Query enhancement error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Query enhancement failed", error: detail });
  }
});
// Modal processing status endpoint.
// GET /api/modal-task/:taskId — proxies a status lookup to the Modal client.
app.get("/api/modal-task/:taskId", async (req, res) => {
  try {
    const status = await modalClient.getTaskStatus(req.params.taskId);
    res.json(status);
  } catch (error) {
    console.error('Modal task status error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Failed to get task status", error: detail });
  }
});
// Batch document ingestion using Modal.
// POST /api/batch-ingest — body: { documents: Array<{content?, filename?, contentType?, metadata?}> }
app.post("/api/batch-ingest", async (req, res) => {
  try {
    const { documents } = req.body;

    const hasWork = Array.isArray(documents) && documents.length > 0;
    if (!hasWork) {
      return res.status(400).json({ message: "Documents array is required" });
    }

    // Shape each incoming record into the upload format the ingestion
    // service expects, substituting safe defaults for missing fields.
    const uploads = documents.map((doc: any) => ({
      file: doc.content || '',
      filename: doc.filename || 'unknown.txt',
      contentType: doc.contentType || 'text/plain',
      metadata: doc.metadata || {}
    }));

    res.json(await smartIngestionService.batchIngestDocuments(uploads));
  } catch (error) {
    console.error('Batch ingestion error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Batch ingestion failed", error: detail });
  }
});
// API Health Check endpoint.
// GET /api/health — 200 when every service is error-free, 503 otherwise.
app.get("/api/health", async (req, res) => {
  try {
    const { checkAPIHealth } = await import('./api-health-check');
    const services = await checkAPIHealth();

    // Healthy overall only when no individual service reports 'error'.
    const anyError = services.some(service => service.status === 'error');

    res.status(anyError ? 503 : 200).json({
      overall: anyError ? 'issues_detected' : 'healthy',
      services,
      timestamp: new Date().toISOString()
    });
  } catch (error) {
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({
      overall: 'error',
      message: 'Health check failed',
      error: detail
    });
  }
});
// Generate embeddings using Nebius
// POST /api/embeddings — body: { input: string | string[], model? }
app.post("/api/embeddings", async (req, res) => {
  try {
    const { input, model = 'text-embedding-ada-002' } = req.body;
    if (!input) {
      return res.status(400).json({ message: "Input text is required" });
    }
    // Embedding APIs conventionally accept a string OR an array of strings.
    // Build the log preview without assuming a string — calling .substring()
    // on an array would throw here and turn a valid request into a 500.
    const preview = typeof input === 'string'
      ? `${input.substring(0, 100)}...`
      : Array.isArray(input)
        ? `[${input.length} input(s)]`
        : JSON.stringify(input);
    console.log('Generating embeddings for input:', preview);
    const embeddings = await nebiusClient.createEmbeddings({ input, model });
    console.log('Embeddings generated successfully');
    res.json(embeddings);
  } catch (error) {
    console.error('Embeddings error:', error);
    res.status(500).json({
      message: "Embedding generation failed",
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
});
// Other routes...
// GET /api/documents?limit=&offset= — paginated document listing.
app.get("/api/documents", async (req, res) => {
  try {
    // Parse pagination params defensively: NaN or negative values fall back
    // to the defaults instead of being forwarded to the storage layer.
    const parsedLimit = parseInt(req.query.limit as string, 10);
    const parsedOffset = parseInt(req.query.offset as string, 10);
    const limit = Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : 50;
    const offset = Number.isInteger(parsedOffset) && parsedOffset >= 0 ? parsedOffset : 0;
    const documents = await storage.getDocuments(limit, offset);
    res.json(documents);
  } catch (error) {
    // Log for diagnosis (previously this failure was swallowed silently),
    // matching the error-handling style of the other handlers in this file.
    console.error('Failed to fetch documents:', error);
    res.status(500).json({ message: "Failed to fetch documents" });
  }
});
// Register document routes - enable uploads by default for all environments
// Hugging Face Spaces have /tmp storage which is suitable for uploads
// NOTE(review): HF_TOKEN only proves a token is configured, not that the
// process is running inside a Space — confirm this heuristic is intended.
const isHuggingFaceSpace = process.env.SPACE_ID || process.env.HF_SPACE_ID ||
process.env.HUGGINGFACE_SPACE_ID || process.env.HF_TOKEN || false;
// Computed only for the diagnostic log below — it does not gate whether
// uploads are enabled (see isDocumentUploadEnabled).
const hasWritableStorage = process.env.NODE_ENV === 'production' ?
fs.existsSync('/tmp') :
true; // Development always has writable storage
// Force enable uploads for Hugging Face Spaces, otherwise check DISABLE_UPLOADS
const isDocumentUploadEnabled = isHuggingFaceSpace ? true : (process.env.DISABLE_UPLOADS !== 'true');
// Emit the resolved configuration so deploy-time misconfiguration is easy to spot.
console.log('๐Ÿ” Environment check:', {
NODE_ENV: process.env.NODE_ENV,
DISABLE_UPLOADS: process.env.DISABLE_UPLOADS,
isHuggingFaceSpace: !!isHuggingFaceSpace,
hasWritableStorage,
isDocumentUploadEnabled
});
// Mount the full document router when uploads are allowed; otherwise mount
// the fallback router (presumably upload-rejecting stubs — see ./upload-fallback).
if (isDocumentUploadEnabled) {
console.log('โœ… Document uploads enabled - full functionality available');
app.use("/api/documents", documentRoutes);
} else {
console.log('โ„น๏ธ Document uploads disabled - using fallback routes');
app.use("/api/documents", uploadFallbackRoutes);
}
const httpServer = createServer(app);
return httpServer;
}