// Fix AI-enhanced search with comprehensive external source integration
// (commit cd55914, author: fazeel007)
import { Express } from "express";
import { createServer, Server } from "http";
import { z } from "zod";
import fs from "fs";
import { storage } from "./storage";
import { searchRequestSchema } from "@shared/schema";
import { smartIngestionService } from "./smart-ingestion";
import { nebiusClient } from "./nebius-client";
import { modalClient } from "./modal-client";
import documentRoutes from "./document-routes";
import uploadFallbackRoutes from "./upload-fallback";
// Shape of a repository object returned by the GitHub search API
// (api.github.com/search/repositories). Only the fields this module reads
// are declared here.
interface GitHubRepo {
  id: number;               // numeric GitHub repository id
  name: string;             // short repository name, e.g. "algorithms"
  full_name: string;        // "owner/name" slug
  // NOTE(review): the GitHub API can return null here — callers in this file
  // guard with `repo.description?` / `|| fallback`; consider `string | null`.
  description: string;
  html_url: string;         // browser-facing URL of the repository
  stargazers_count: number; // star count at search time
  language: string;         // primary language reported by GitHub
  topics: string[];         // repository topic tags
  created_at: string;       // ISO 8601 creation timestamp
  updated_at: string;       // ISO 8601 last-push/update timestamp
}
// Using Nebius client instead of OpenAI for all AI operations
// Strip DeepSeek R1 "<think>…</think>" reasoning sections from model output.
// Closed pairs (and their trailing whitespace) are removed; if an opening tag
// is left dangling (stream cut off mid-thought), everything from that tag
// onward is dropped. Cleaned output is trimmed; input without any tag is
// returned untouched (no trim).
function cleanThinkingTags(text: string): string {
  if (typeof text !== 'string' || !text.includes('<think>')) {
    return text;
  }
  // Remove every fully-closed <think>…</think> section plus trailing whitespace.
  let result = text.replace(/<think>[\s\S]*?<\/think>\s*/g, '');
  // A leftover opening tag means it was never closed — drop the remainder.
  const dangling = result.indexOf('<think>');
  if (dangling !== -1) {
    result = result.slice(0, dangling);
  }
  return result.trim();
}
// URL validation utility to check if websites are accessible and content is valid.
//
// Dispatches to specialised validators for domains known to return HTTP 200
// for missing content (arxiv.org, vldb.org, cvpr.org, icse.org) and falls
// back to a cheap HEAD request for everything else. 2xx/3xx counts as valid.
//
// @param url      absolute URL to probe
// @param timeout  abort the probe after this many milliseconds (default 5000)
// @returns true when the URL looks reachable/valid, false on any failure
async function validateUrl(url: string, timeout: number = 5000): Promise<boolean> {
  const controller = new AbortController();
  // One timer guards every path below; it is cleared in `finally` so it can
  // neither fire after we return nor keep the event loop alive.
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    console.log(`Validating URL: ${url}`);
    const urlObj = new URL(url);
    // Special handling for ArXiv URLs to validate paper existence
    if (urlObj.hostname.includes('arxiv.org')) {
      return await validateArxivUrl(url, controller.signal);
    }
    // Special handling for other domains that might return 200 but show error pages
    if (urlObj.hostname.includes('vldb.org') ||
        urlObj.hostname.includes('cvpr.org') ||
        urlObj.hostname.includes('icse.org')) {
      return await validateContentUrl(url, controller.signal);
    }
    // HEAD request for everything else. (The previous "highly trusted domain"
    // fast path issued a byte-identical request, so the two paths are merged.)
    const response = await fetch(url, {
      method: 'HEAD',
      signal: controller.signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (URL Validator)'
      }
    });
    // Consider 2xx and 3xx status codes as valid
    const isValid = response.status >= 200 && response.status < 400;
    console.log(`URL ${url} validation result: ${isValid ? 'VALID' : 'INVALID'} (${response.status})`);
    return isValid;
  } catch (error) {
    console.log(`URL ${url} validation failed: ${error instanceof Error ? error.message : String(error)}`);
    return false;
  } finally {
    // Fixes a leak in the original: the timer was never cleared on the
    // arxiv/content-validator delegation paths or on errors, so it could fire
    // long after the function returned and abort an already-settled request.
    clearTimeout(timeoutId);
  }
}
// Special validation for ArXiv URLs to check if papers actually exist.
// arxiv.org serves HTTP 200 with an error page for unknown identifiers, so a
// plain status check is not enough: the identifier format is validated first,
// then the fetched page body is scanned for known error phrases.
//
// @param url     an https://arxiv.org/abs/<id> style URL
// @param signal  abort signal shared with the caller's timeout
// @returns true when the id is well-formed and the page looks like a real
//          abstract page, false otherwise
async function validateArxivUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    // Extract paper ID from URL
    const match = url.match(/arxiv\.org\/abs\/(.+)$/);
    if (!match) {
      console.log(`Invalid ArXiv URL format: ${url}`);
      return false;
    }
    const paperId = match[1];
    // Validate ArXiv ID format. Both identifier schemes allow an optional
    // "vN" version suffix (e.g. 1706.03762v5); the previous regexes rejected
    // those valid URLs outright, producing false negatives.
    const validFormats = [
      /^\d{4}\.\d{4,5}(v\d+)?$/, // New format: 2024.12345 or 2024.12345v2
      /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/, // Old format: cs.AI/1234567
    ];
    const hasValidFormat = validFormats.some(regex => regex.test(paperId));
    if (!hasValidFormat) {
      console.log(`Invalid ArXiv paper ID format: ${paperId}`);
      return false;
    }
    // Try to fetch the paper to see if it exists
    const response = await fetch(url, {
      method: 'GET', // Need GET to check content
      signal: signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (ArXiv Validator)'
      }
    });
    if (!response.ok) {
      console.log(`ArXiv URL returned ${response.status}: ${url}`);
      return false;
    }
    // Check if the response contains error messages.
    // NOTE(review): the bare 'error' indicator is very broad and may
    // false-positive on legitimate pages (e.g. "error" in embedded scripts) —
    // consider tightening before trusting rejections here.
    const content = await response.text();
    const errorIndicators = [
      'not recognized',
      'might instead try to search',
      'article identifier',
      'not found',
      'error'
    ];
    const hasError = errorIndicators.some(indicator =>
      content.toLowerCase().includes(indicator.toLowerCase())
    );
    if (hasError) {
      console.log(`ArXiv paper not found: ${url}`);
      return false;
    }
    console.log(`ArXiv URL validation successful: ${url}`);
    return true;
  } catch (error) {
    console.log(`ArXiv URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}
// Validation for URLs that might return 200 but show error content.
// Fetches the page body and rejects it when any known error phrase appears;
// non-2xx responses and fetch failures are also treated as invalid.
async function validateContentUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    const response = await fetch(url, {
      method: 'GET', // Need GET to check content
      signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (Content Validator)'
      }
    });
    if (!response.ok) {
      console.log(`Content URL returned ${response.status}: ${url}`);
      return false;
    }
    // Scan the body once (lower-cased) for common error-page phrases.
    const body = (await response.text()).toLowerCase();
    const errorIndicators = [
      '404',
      'not found',
      'page not found',
      'does not exist',
      'error',
      'can\'t be reached',
      'site is temporarily unavailable'
    ];
    for (const indicator of errorIndicators) {
      if (body.includes(indicator.toLowerCase())) {
        console.log(`Content validation failed for: ${url}`);
        return false;
      }
    }
    console.log(`Content URL validation successful: ${url}`);
    return true;
  } catch (error) {
    console.log(`Content URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}
// Batch validate multiple URLs with a concurrency limit.
// URLs are processed in fixed-size chunks so at most `concurrencyLimit`
// requests are in flight at once; results map each URL to its validity.
async function validateUrls(urls: string[], concurrencyLimit: number = 5): Promise<Map<string, boolean>> {
  const outcome = new Map<string, boolean>();
  let offset = 0;
  while (offset < urls.length) {
    const chunk = urls.slice(offset, offset + concurrencyLimit);
    // Validate the whole chunk in parallel before moving to the next one.
    await Promise.all(
      chunk.map(async (target) => {
        outcome.set(target, await validateUrl(target));
      })
    );
    offset += concurrencyLimit;
  }
  return outcome;
}
// Enhanced web search using multiple authentic data sources.
// Queries up to three public APIs — Wikipedia, ArXiv (only for ML/AI/CS
// queries), and REST Countries (only for country/nation queries) — then
// filters the collected hits down to those whose URLs are actually reachable.
// Each per-source failure is logged and swallowed so one bad API never sinks
// the whole search.
//
// @param query       free-text user query
// @param maxResults  cap on the number of results returned (default 10)
// @returns array of { title, content, url, source, type } result objects
async function searchWeb(query: string, maxResults: number = 10): Promise<any[]> {
  const results = [];
  try {
    console.log(`Starting web search for: "${query}"`);
    // 1. Wikipedia search for general knowledge
    try {
      // First try Wikipedia search API.
      // NOTE(review): this is actually the page *summary* endpoint for an
      // exact-title guess (spaces -> underscores), not a full-text search.
      const wikiSearchUrl = `https://en.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(query.replace(/\s+/g, '_'))}`;
      console.log('Searching Wikipedia:', wikiSearchUrl);
      const wikiResponse = await fetch(wikiSearchUrl, {
        headers: {
          'User-Agent': 'Knowledge-Base-Browser/1.0'
        },
        signal: AbortSignal.timeout(3000) // 3 second timeout
      });
      if (wikiResponse.ok) {
        const wikiData = await wikiResponse.json();
        // Require a non-trivial extract so stub/disambiguation pages are skipped.
        if (wikiData.extract && wikiData.extract.length > 50) {
          results.push({
            title: wikiData.title,
            content: wikiData.extract,
            url: wikiData.content_urls?.desktop?.page || `https://en.wikipedia.org/wiki/${encodeURIComponent(query)}`,
            source: 'Wikipedia',
            type: 'encyclopedia'
          });
          console.log('Found Wikipedia result:', wikiData.title);
        }
      }
    } catch (wikiError) {
      // Non-fatal: continue with the remaining sources.
      console.log('Wikipedia search failed:', wikiError instanceof Error ? wikiError.message : String(wikiError));
    }
    // 2. ArXiv search for research papers (for ML/AI/CS topics)
    if (query.toLowerCase().includes('machine learning') ||
        query.toLowerCase().includes('neural network') ||
        query.toLowerCase().includes('algorithm') ||
        query.toLowerCase().includes('artificial intelligence') ||
        query.toLowerCase().includes('data science') ||
        query.toLowerCase().includes('deep learning')) {
      try {
        const arxivQuery = encodeURIComponent(query);
        // NOTE(review): plain http:// endpoint — confirm https is not required.
        const arxivUrl = `http://export.arxiv.org/api/query?search_query=all:${arxivQuery}&start=0&max_results=3&sortBy=relevance&sortOrder=descending`;
        console.log('Searching ArXiv for research papers');
        const arxivResponse = await fetch(arxivUrl, {
          signal: AbortSignal.timeout(5000) // 5 second timeout
        });
        if (arxivResponse.ok) {
          const arxivXml = await arxivResponse.text();
          // Parse ArXiv XML response.
          // Naive split-based parsing: each "<entry>" chunk is scanned with
          // regexes rather than a real XML parser; at most 2 entries are used.
          const entries = arxivXml.split('<entry>').slice(1);
          for (const entry of entries.slice(0, 2)) {
            const titleMatch = entry.match(/<title[^>]*>([^<]+)<\/title>/);
            const summaryMatch = entry.match(/<summary[^>]*>([^<]+)<\/summary>/);
            const linkMatch = entry.match(/<id[^>]*>([^<]+)<\/id>/);
            if (titleMatch && summaryMatch && linkMatch) {
              const title = titleMatch[1].trim();
              const summary = summaryMatch[1].trim().substring(0, 300); // cap abstract at 300 chars
              const url = linkMatch[1].trim();
              if (title && summary.length > 50) {
                results.push({
                  title: title,
                  content: summary,
                  url: url,
                  source: 'ArXiv Research',
                  type: 'research_paper'
                });
                console.log('Found ArXiv paper:', title);
              }
            }
          }
        }
      } catch (arxivError) {
        console.log('ArXiv search failed:', arxivError instanceof Error ? arxivError.message : String(arxivError));
      }
    }
    // 3. Try REST Countries API for country-related queries
    if (query.toLowerCase().includes('country') || query.toLowerCase().includes('nation')) {
      try {
        // Strip the trigger words so only the (presumed) country name remains.
        const countryQuery = query.replace(/country|nation/gi, '').trim();
        const countryUrl = `https://restcountries.com/v3.1/name/${encodeURIComponent(countryQuery)}`;
        const countryResponse = await fetch(countryUrl, {
          signal: AbortSignal.timeout(3000) // 3 second timeout
        });
        if (countryResponse.ok) {
          const countryData = await countryResponse.json();
          if (Array.isArray(countryData) && countryData.length > 0) {
            const country = countryData[0]; // best match only
            results.push({
              title: `${country.name.common} - Country Information`,
              content: `${country.name.common} is located in ${country.region}, ${country.subregion}. Capital: ${country.capital?.[0] || 'N/A'}. Population: ${country.population?.toLocaleString() || 'Unknown'}. Official languages: ${Object.values(country.languages || {}).join(', ')}.`,
              url: `https://en.wikipedia.org/wiki/${encodeURIComponent(country.name.common)}`,
              source: 'REST Countries API',
              type: 'geographic'
            });
            console.log('Found country information:', country.name.common);
          }
        }
      } catch (countryError) {
        console.log('Country search failed:', countryError instanceof Error ? countryError.message : String(countryError));
      }
    }
    console.log(`Web search completed. Found ${results.length} results.`);
    // Validate URLs before returning results
    if (results.length > 0) {
      console.log('Validating URLs for accessibility...');
      const urls = results.map(result => result.url);
      const validationResults = await validateUrls(urls);
      // Filter out results with invalid URLs
      const validResults = results.filter(result => {
        const isValid = validationResults.get(result.url);
        if (!isValid) {
          console.log(`Filtered out invalid URL: ${result.url} (${result.title})`);
        }
        return isValid;
      });
      console.log(`URL validation completed. ${validResults.length}/${results.length} URLs are accessible.`);
      return validResults.slice(0, maxResults);
    }
    return results.slice(0, maxResults);
  } catch (error) {
    // Outer catch: any unexpected failure yields an empty result set rather
    // than propagating to the route handler.
    console.error('Web search error:', error);
    return [];
  }
}
// Transform a raw searchWeb() hit into the document shape the search API
// returns. External results are scored lower than local knowledge-base hits:
// 0.6 at rank 0, decreasing by 0.1 per rank, floored at 0.2.
function transformWebResultToDocument(result: any, rank: number, query: string): any {
  const text: string = result.content;
  // Truncate long bodies to a 200-character preview.
  const preview = text.length > 200 ? `${text.slice(0, 200)}...` : text;
  const doc: any = {
    id: `web_${Date.now()}_${rank}`, // synthetic id; unique per call+rank
    title: result.title,
    content: text,
    snippet: preview,
    source: result.source,
    sourceType: 'web',
    url: result.url,
    metadata: {
      search_type: result.type,
      fetched_at: new Date().toISOString()
    },
    relevanceScore: Math.max(0.2, 0.6 - rank * 0.1),
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.2 + 0.1, // simulated latency in seconds
    tokenCount: Math.floor(text.length / 4) // rough 4-chars-per-token estimate
  };
  return doc;
}
// Search GitHub's repository search API for repositories relevant to the
// query. Builds a search expression from query heuristics ("by <author>"
// patterns, data-structures/algorithm keywords, default python filter),
// retries with fallback expressions when an author-scoped search comes back
// empty, and finally drops repos whose html_url fails validation.
//
// @param query       free-text user query
// @param maxResults  per_page value sent to the GitHub API (default 10)
// @returns array of raw GitHub repo objects (possibly URL-filtered), or []
async function searchGitHubRepos(query: string, maxResults: number = 10): Promise<any[]> {
  try {
    // Parse query to extract author and repository details
    const lowerQuery = query.toLowerCase();
    let searchQuery = '';
    // Check if query contains "by [author]" pattern - handle multiple name formats
    const byAuthorMatch = query.match(/by\s+([a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*)/i);
    if (byAuthorMatch) {
      const authorName = byAuthorMatch[1].trim();
      // Remainder of the query once the "by <author>" clause is removed.
      const topicPart = query.replace(/by\s+[a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*/i, '').trim();
      // Try different author search strategies - include multiple language options
      // NOTE(review): only authorSearches[0] is ever used below; entries 1-3
      // are built but dead.
      const authorSearches = [
        `${topicPart} user:${authorName.replace(/\s+/g, '')}`, // No language restriction first
        `${topicPart} user:${authorName.replace(/\s+/g, '')} language:python`,
        `${topicPart} user:${authorName.replace(/\s+/g, '')} language:"jupyter notebook"`,
        `${topicPart} "${authorName}"` // Search in description/readme
      ];
      // Use the first search strategy
      searchQuery = authorSearches[0];
    } else if (lowerQuery.includes('data structures') || lowerQuery.includes('algorithm')) {
      // Enhanced search for data structures and algorithms
      searchQuery = `${query} "data structures" OR "algorithms" language:python`;
    } else {
      // Default: restrict to Python repositories.
      searchQuery = `${query} language:python`;
    }
    console.log('GitHub search query:', searchQuery);
    // NOTE(review): if GITHUB_TOKEN is unset this sends "token undefined";
    // the /api/search route gates on the env var, but other callers may not.
    const response = await fetch(`https://api.github.com/search/repositories?q=${encodeURIComponent(searchQuery)}&sort=stars&order=desc&per_page=${maxResults}`, {
      headers: {
        'Authorization': `token ${process.env.GITHUB_TOKEN}`,
        'Accept': 'application/vnd.github.v3+json',
        'User-Agent': 'Knowledge-Base-Browser'
      }
    });
    if (!response.ok) {
      console.error('GitHub API error:', response.status, response.statusText);
      return [];
    }
    const data = await response.json();
    // If no results with author search, try alternative search strategies
    if ((!data.items || data.items.length === 0) && byAuthorMatch) {
      const authorName = byAuthorMatch[1].trim();
      const topicPart = query.replace(/by\s+[a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*/i, '').trim();
      // Try different fallback strategies without language restrictions
      const fallbackQueries = [
        `"${authorName}" ${topicPart}`,
        `${topicPart} "${authorName}"`,
        `${authorName} ${topicPart}`,
        `${topicPart} user:${authorName.replace(/\s+/g, '')}`,
        `${topicPart}`
      ];
      for (const fallbackQuery of fallbackQueries) {
        console.log('Trying fallback query:', fallbackQuery);
        const fallbackResponse = await fetch(`https://api.github.com/search/repositories?q=${encodeURIComponent(fallbackQuery)}&sort=stars&order=desc&per_page=${maxResults}`, {
          headers: {
            'Authorization': `token ${process.env.GITHUB_TOKEN}`,
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'Knowledge-Base-Browser'
          }
        });
        if (fallbackResponse.ok) {
          const fallbackData = await fallbackResponse.json();
          if (fallbackData.items && fallbackData.items.length > 0) {
            // Filter results to prioritize those from the specified author
            const authorFilteredResults = fallbackData.items.filter((repo: any) =>
              repo.owner.login.toLowerCase().includes(authorName.toLowerCase()) ||
              repo.full_name.toLowerCase().includes(authorName.toLowerCase()) ||
              repo.description?.toLowerCase().includes(authorName.toLowerCase())
            );
            // NOTE(review): these early returns skip the URL validation that
            // the main path below performs — confirm that is intentional.
            if (authorFilteredResults.length > 0) {
              return authorFilteredResults;
            } else {
              return fallbackData.items;
            }
          }
        }
      }
    }
    const repos = data.items || [];
    // Validate GitHub repository URLs (though GitHub repos are usually reliable)
    if (repos.length > 0) {
      console.log('Validating GitHub repository URLs...');
      const urls = repos.map((repo: GitHubRepo) => repo.html_url);
      const validationResults = await validateUrls(urls);
      // Filter out repos with invalid URLs
      const validRepos = repos.filter((repo: GitHubRepo) => {
        const isValid = validationResults.get(repo.html_url);
        if (!isValid) {
          console.log(`Filtered out invalid GitHub repo: ${repo.html_url} (${repo.full_name})`);
        }
        return isValid;
      });
      console.log(`GitHub URL validation completed. ${validRepos.length}/${repos.length} repositories are accessible.`);
      return validRepos;
    }
    return repos;
  } catch (error) {
    // Any unexpected failure yields an empty result set for the caller.
    console.error('Error fetching GitHub repos:', error);
    return [];
  }
}
// Convert a GitHub search-API repo into the document shape the search API
// returns. GitHub results score 0.7 at rank 0, decreasing by 0.1 per rank,
// floored at 0.3.
function transformGitHubRepoToDocument(repo: GitHubRepo, rank: number, query: string): any {
  const desc = repo.description;
  // 200-char preview of the description, or a fixed placeholder.
  const preview = desc
    ? desc.substring(0, 200) + (desc.length > 200 ? '...' : '')
    : 'No description available';
  // Multi-line content block: description, blank line, then key repo facts.
  const body = [
    desc || 'No description available',
    '',
    `Repository: ${repo.full_name}`,
    `Language: ${repo.language}`,
    `Stars: ${repo.stargazers_count}`,
    `Topics: ${repo.topics.join(', ')}`,
    `Created: ${repo.created_at}`,
    `Last Updated: ${repo.updated_at}`
  ].join('\n');
  return {
    id: repo.id,
    title: `${repo.name} - ${repo.full_name}`,
    content: body,
    snippet: preview,
    source: 'GitHub Repository',
    sourceType: 'code',
    url: repo.html_url,
    metadata: {
      stars: repo.stargazers_count,
      language: repo.language,
      topics: repo.topics,
      created_at: repo.created_at,
      updated_at: repo.updated_at
    },
    relevanceScore: Math.max(0.3, 0.7 - rank * 0.1),
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.3 + 0.1, // simulated latency in seconds
    tokenCount: Math.floor((repo.description?.length || 100) / 4) // ~4 chars/token
  };
}
export async function registerRoutes(app: Express): Promise<Server> {
// Knowledge graph data endpoint
app.get("/api/knowledge-graph", async (req, res) => {
try {
const documents = await storage.getDocuments(50);
const nodes: any[] = [];
const links: any[] = [];
// Create document nodes from actual storage
documents.forEach(doc => {
nodes.push({
id: `doc_${doc.id}`,
label: doc.title.substring(0, 50) + (doc.title.length > 50 ? "..." : ""),
type: "document",
size: 12,
color: "#3b82f6",
metadata: {
title: doc.title,
sourceType: doc.sourceType,
year: new Date(doc.createdAt).getFullYear(),
id: doc.id
}
});
});
// Extract concepts from document content
const conceptMap = new Map<string, number>();
const conceptToDocuments = new Map<string, number[]>();
documents.forEach(doc => {
const content = doc.content.toLowerCase();
const concepts = [
'ai', 'artificial intelligence', 'machine learning', 'deep learning',
'neural networks', 'transformer', 'attention', 'embedding', 'vector',
'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini',
'multimodal', 'fine-tuning', 'training', 'optimization', 'safety',
'alignment', 'reasoning', 'language model', 'nlp', 'computer vision'
];
concepts.forEach(concept => {
if (content.includes(concept)) {
conceptMap.set(concept, (conceptMap.get(concept) || 0) + 1);
if (!conceptToDocuments.has(concept)) {
conceptToDocuments.set(concept, []);
}
conceptToDocuments.get(concept)!.push(doc.id);
}
});
});
// Create document-to-document connections based on shared concepts
const documentConnections = new Map<string, Set<number>>();
documents.forEach(doc1 => {
const doc1Concepts = new Set<string>();
const content1 = doc1.content.toLowerCase();
// Enhanced concept detection for better connections
const allConcepts = [
'ai', 'artificial intelligence', 'machine learning', 'deep learning',
'neural networks', 'transformer', 'attention', 'embedding', 'vector',
'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini',
'multimodal', 'fine-tuning', 'training', 'optimization', 'safety',
'alignment', 'reasoning', 'language model', 'nlp', 'computer vision',
'code generation', 'programming', 'software', 'development', 'copilot',
'constitutional ai', 'rlhf', 'instruction tuning', 'benchmarks',
'performance', 'efficiency', 'compression', 'quantization', 'edge ai',
'mamba', 'mixture of experts', 'moe', 'architecture', 'scaling'
];
allConcepts.forEach(concept => {
if (content1.includes(concept)) {
doc1Concepts.add(concept);
}
});
// Find related documents with shared concepts
documents.forEach(doc2 => {
if (doc1.id !== doc2.id) {
const content2 = doc2.content.toLowerCase();
let sharedConcepts = 0;
doc1Concepts.forEach(concept => {
if (content2.includes(concept)) {
sharedConcepts++;
}
});
// Create connection if documents share 3+ concepts
if (sharedConcepts >= 3) {
const connectionKey = `${Math.min(doc1.id, doc2.id)}_${Math.max(doc1.id, doc2.id)}`;
if (!documentConnections.has(connectionKey)) {
documentConnections.set(connectionKey, new Set([doc1.id, doc2.id]));
links.push({
source: `doc_${doc1.id}`,
target: `doc_${doc2.id}`,
relationship: "related_concepts",
strength: Math.min(sharedConcepts / 10, 1),
color: "#3b82f6"
});
}
}
}
});
});
// Create concept nodes for concepts that appear in multiple documents
conceptMap.forEach((count, concept) => {
if (count >= 2) {
nodes.push({
id: `concept_${concept.replace(/\s+/g, '_')}`,
label: concept,
type: "concept",
size: 8 + count * 2,
color: "#10b981",
metadata: {
documentCount: count,
concept: concept
}
});
// Link concept to documents
const relatedDocs = conceptToDocuments.get(concept) || [];
relatedDocs.forEach(docId => {
links.push({
source: `doc_${docId}`,
target: `concept_${concept.replace(/\s+/g, '_')}`,
relationship: "contains_concept",
strength: 1,
color: "#10b981"
});
});
}
});
// Extract research teams from document metadata
const researchTeams = new Map<string, number[]>();
documents.forEach(doc => {
if (doc.metadata) {
let teamName = '';
const metadata = typeof doc.metadata === 'string' ? JSON.parse(doc.metadata) : doc.metadata;
// Extract team names from authors or venue
if (metadata.authors && Array.isArray(metadata.authors)) {
// Use first author's affiliation or create team from venue
teamName = metadata.venue || 'Research Team';
} else if (metadata.venue) {
teamName = metadata.venue;
} else if (doc.source) {
// Extract team from source
if (doc.source.includes('OpenAI')) teamName = 'OpenAI Research';
else if (doc.source.includes('Anthropic')) teamName = 'Anthropic';
else if (doc.source.includes('Google') || doc.source.includes('DeepMind')) teamName = 'Google DeepMind';
else if (doc.source.includes('LangChain')) teamName = 'LangChain Team';
else if (doc.source.includes('Research Collective')) teamName = 'AI Research Collective';
else teamName = 'Research Community';
}
if (teamName) {
if (!researchTeams.has(teamName)) {
researchTeams.set(teamName, []);
}
researchTeams.get(teamName)!.push(doc.id);
}
}
});
// Create research team nodes
researchTeams.forEach((docIds, teamName) => {
nodes.push({
id: `team_${teamName.replace(/\s+/g, '_')}`,
label: teamName,
type: "author",
size: 8 + docIds.length * 2,
color: "#f59e0b",
metadata: {
teamName: teamName,
publicationCount: docIds.length
}
});
// Link team to documents
docIds.forEach(docId => {
links.push({
source: `team_${teamName.replace(/\s+/g, '_')}`,
target: `doc_${docId}`,
relationship: "authored_by",
strength: 0.8,
color: "#f59e0b"
});
});
});
// Create source type clusters
const sourceTypes = new Map<string, number[]>();
documents.forEach(doc => {
const sourceType = doc.sourceType || 'unknown';
if (!sourceTypes.has(sourceType)) {
sourceTypes.set(sourceType, []);
}
sourceTypes.get(sourceType)!.push(doc.id);
});
sourceTypes.forEach((docIds, sourceType) => {
if (docIds.length >= 2) {
nodes.push({
id: `source_${sourceType}`,
label: sourceType.charAt(0).toUpperCase() + sourceType.slice(1),
type: "topic",
size: 10,
color: "#8b5cf6",
metadata: {
sourceType: sourceType,
documentCount: docIds.length
}
});
// Link source type to documents
docIds.forEach(docId => {
links.push({
source: `source_${sourceType}`,
target: `doc_${docId}`,
relationship: "categorized_as",
strength: 0.6,
color: "#8b5cf6"
});
});
}
});
res.json({
nodes,
links,
stats: {
totalDocuments: documents.length,
totalConcepts: conceptMap.size,
totalResearchTeams: researchTeams.size,
totalSourceTypes: sourceTypes.size
}
});
} catch (error) {
console.error("Knowledge graph generation failed:", error);
res.status(500).json({
error: "Failed to generate knowledge graph",
nodes: [],
links: [],
stats: { totalDocuments: 0, totalConcepts: 0, totalResearchTeams: 0, totalSourceTypes: 0 }
});
}
});
// Enhanced search with web fallback
app.post("/api/search", async (req, res) => {
try {
const searchRequest = searchRequestSchema.parse(req.body);
const streaming = req.body.streaming === true;
const startTime = Date.now();
let allDocuments: any[] = [];
// Enhanced multi-source search for semantic queries
if (searchRequest.searchType === "semantic") {
console.log(`๐Ÿ” Enhanced multi-source search for: "${searchRequest.query}"`);
// 1. First, always do keyword search on knowledge base
console.log('๐Ÿ“š Searching knowledge base...');
// Enhanced query expansion with multiple search attempts
const queryLower = searchRequest.query.toLowerCase();
const searchQueries = [searchRequest.query]; // Start with original query
// Add related terms for better matching
if (queryLower.includes('mistral')) {
searchQueries.push('Mixtral', 'Mistral AI');
}
if (queryLower.includes('mixtral')) {
searchQueries.push('Mistral', 'mixture of experts');
}
if (queryLower.includes('llama')) {
searchQueries.push('LLaMA', 'Large Language Model Meta AI');
}
if (queryLower.includes('gpt')) {
searchQueries.push('GPT', 'Generative Pre-trained Transformer');
}
if (queryLower.includes('transformer') || queryLower.includes('attention')) {
searchQueries.push('Attention Is All You Need', 'transformer', 'attention mechanism');
}
if (queryLower.includes('constitutional')) {
searchQueries.push('Constitutional AI', 'harmlessness', 'AI feedback');
}
if (queryLower.includes('rag') || queryLower.includes('retrieval')) {
searchQueries.push('Retrieval-Augmented Generation', 'retrieval augmented', 'knowledge-intensive');
}
// Search with each query and combine results
const allSearchResults = new Map<number, any>();
for (const query of searchQueries) {
const searchResult = await storage.searchDocuments({ ...searchRequest, query });
for (const doc of searchResult.results || []) {
if (!allSearchResults.has(doc.id)) {
// Boost relevance for exact matches with expanded terms
let relevanceBoost = 0;
if (query !== searchRequest.query) {
relevanceBoost = 0.2; // Boost expanded term matches
}
allSearchResults.set(doc.id, {
...doc,
relevanceScore: Math.min(doc.relevanceScore + relevanceBoost, 1.0)
});
}
}
}
allDocuments = Array.from(allSearchResults.values());
allDocuments = allDocuments.map(doc => ({
...doc,
relevanceScore: Math.min(doc.relevanceScore + 0.6, 1.0), // Boost local results
rank: doc.rank,
snippet: doc.snippet || doc.content.substring(0, 200) + '...'
}));
console.log(`๐Ÿ“š Found ${allDocuments.length} local documents`);
console.log(`๐Ÿ“š Query expansion searched for: ${searchQueries.join(', ')}`);
// Skip AI enhancement for now to test query expansion
// TODO: Re-enable AI enhancement after fixing query expansion
} else {
// Use regular keyword search for other search types
const localResults = await storage.searchDocuments(searchRequest);
// Boost relevance scores for knowledge base documents to prioritize them
allDocuments = (localResults.results || []).map(doc => ({
...doc,
relevanceScore: Math.min(doc.relevanceScore + 0.5, 1.0) // Boost by 0.5
}));
}
// Validate URLs in local storage results as well
if (allDocuments.length > 0) {
console.log('Validating URLs in local storage results...');
const documentsWithUrls = allDocuments.filter(doc => doc.url);
if (documentsWithUrls.length > 0) {
const urls = documentsWithUrls.map(doc => doc.url).filter((url): url is string => url !== null);
const validationResults = await validateUrls(urls);
// Filter out documents with invalid URLs
allDocuments = allDocuments.filter(doc => {
if (!doc.url) return true; // Keep documents without URLs
const isValid = validationResults.get(doc.url);
if (!isValid) {
console.log(`Filtered out local document with invalid URL: ${doc.url} (${doc.title})`);
}
return isValid;
});
console.log(`Local URL validation completed. ${allDocuments.length} documents have valid URLs.`);
}
}
// Always search external sources to provide comprehensive results
console.log(`๐ŸŒ Searching external sources to supplement ${allDocuments.length} local results...`);
// Check if we should search GitHub
const isCodeQuery = searchRequest.query.toLowerCase().includes('python') ||
searchRequest.query.toLowerCase().includes('data structures') ||
searchRequest.query.toLowerCase().includes('algorithm') ||
searchRequest.query.toLowerCase().includes('repository') ||
searchRequest.query.toLowerCase().includes('code') ||
searchRequest.query.toLowerCase().includes('programming') ||
searchRequest.query.toLowerCase().includes('github');
// Enhanced keyword detection for AI/ML queries that might have relevant code
const isAIQuery = searchRequest.query.toLowerCase().includes('mistral') ||
searchRequest.query.toLowerCase().includes('llama') ||
searchRequest.query.toLowerCase().includes('transformer') ||
searchRequest.query.toLowerCase().includes('gpt') ||
searchRequest.query.toLowerCase().includes('ai') ||
searchRequest.query.toLowerCase().includes('machine learning') ||
searchRequest.query.toLowerCase().includes('neural network');
// Query analysis for external search triggers
// Enhanced external search with better error handling and timeouts
const externalSearchPromises = [];
// GitHub search for code and AI-related queries
if ((isCodeQuery || isAIQuery) && process.env.GITHUB_TOKEN) {
console.log('๐Ÿ™ Searching GitHub...');
externalSearchPromises.push(
Promise.race([
searchGitHubRepos(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3)))
.then(repos => ({
type: 'github',
results: repos.map((repo, index) =>
transformGitHubRepoToDocument(repo, index + allDocuments.length, searchRequest.query)
)
}))
.catch(error => {
console.log('๐Ÿ™ GitHub search failed:', error.message);
return { type: 'github', results: [] };
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('GitHub search timeout')), 8000)
)
]).catch(() => ({ type: 'github', results: [] }))
);
}
// Always include web search for comprehensive coverage
console.log('๐ŸŒ Searching web...');
externalSearchPromises.push(
Promise.race([
searchWeb(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3)))
.then(webResults => ({
type: 'web',
results: webResults.map((result, index) =>
transformWebResultToDocument(result, index + allDocuments.length, searchRequest.query)
)
}))
.catch(error => {
console.log('๐ŸŒ Web search failed:', error.message);
return { type: 'web', results: [] };
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Web search timeout')), 5000)
)
]).catch(() => ({ type: 'web', results: [] }))
);
// Wait for external searches with timeout protection
if (externalSearchPromises.length > 0) {
try {
const externalResults = await Promise.all(externalSearchPromises);
// Flatten and combine results
const githubResult = externalResults.find((r: any) => r?.type === 'github') as any;
const webResult = externalResults.find((r: any) => r?.type === 'web') as any;
const githubResults = githubResult?.results || [];
const webResults = webResult?.results || [];
const allExternalResults = [...githubResults, ...webResults];
console.log(`๐ŸŒ Found ${allExternalResults.length} external results (GitHub: ${githubResults.length}, Web: ${webResults.length})`);
// Combine local and external results, keeping local results prioritized
if (allExternalResults.length > 0) {
allDocuments = [...allDocuments, ...allExternalResults]
.sort((a, b) => b.relevanceScore - a.relevanceScore)
.slice(0, searchRequest.limit);
}
} catch (externalError: any) {
console.log('๐ŸŒ External search failed:', externalError?.message || externalError);
}
}
console.log(`โœ… Total results: ${allDocuments.length}`);
const searchTime = (Date.now() - startTime) / 1000;
const response = {
results: allDocuments,
totalCount: allDocuments.length,
searchTime,
query: searchRequest.query,
queryId: Date.now()
};
res.json(response);
} catch (error) {
if (error instanceof z.ZodError) {
res.status(400).json({ message: "Invalid search request", errors: error.errors });
} else {
console.error('Search error:', error);
res.status(500).json({ message: "Internal server error" });
}
}
});
// AI explanation endpoint using Nebius
// POST /api/explain — returns a short spoken-word-friendly explanation of a
// document, built from its title and snippet only.
app.post("/api/explain", async (req, res) => {
  try {
    // Only title and snippet feed the prompt; the previously-destructured
    // `content` field was never used, so it is no longer pulled from the body.
    const { title, snippet } = req.body;
    if (!title || !snippet) {
      return res.status(400).json({ message: "Title and snippet are required" });
    }
    const prompt = `You are an expert communicator. Explain this document directly in a clear, conversational way suitable for audio playback. Do not show your thinking process - just provide the final explanation.
Title: ${title}
Content: ${snippet}
Provide a brief, engaging explanation (2-3 sentences) that would be pleasant to listen to. Focus on the key concepts and practical value. Start your response immediately with the explanation.`;
    const response = await nebiusClient.createChatCompletion({
      model: "deepseek-ai/DeepSeek-R1-0528", // Using DeepSeek model via Nebius
      messages: [{ role: "user", content: prompt }],
      max_tokens: 150,
      temperature: 0.7,
    });
    // Chat APIs may legally return null/undefined message content; guard so a
    // successful-but-empty completion yields an empty explanation, not a 500.
    const raw = response.choices[0]?.message?.content ?? '';
    const explanation = cleanThinkingTags(raw);
    res.json({ explanation });
  } catch (error) {
    console.error('AI explanation error:', error);
    res.status(500).json({ message: "Failed to generate explanation" });
  }
});
// Enhanced AI-powered semantic search backed by Nebius and Modal.
// POST /api/ai-search — body: { query, maxResults?, useQueryEnhancement? }
app.post("/api/ai-search", async (req, res) => {
  try {
    const { query, maxResults = 10, useQueryEnhancement = true } = req.body;

    // A usable query is a non-empty string; anything else is a client error.
    const queryIsUsable = typeof query === 'string' && Boolean(query);
    if (!queryIsUsable) {
      return res.status(400).json({ message: "Query is required" });
    }

    const searchOptions = {
      maxResults,
      searchType: 'semantic' as const,
      useQueryEnhancement
    };
    res.json(await smartIngestionService.enhancedSearch(query, searchOptions));
  } catch (error) {
    console.error('AI search error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "AI search failed", error: detail });
  }
});
// Document analysis via Nebius AI.
// POST /api/analyze-document — body: { content, analysisType?, useMarkdown? }
// Defaults to a markdown-formatted summary when the optional flags are absent.
app.post("/api/analyze-document", async (req, res) => {
  try {
    const { content, analysisType = 'summary', useMarkdown = true } = req.body;

    if (!content) {
      return res.status(400).json({ message: "Content is required" });
    }

    const request = { content, analysisType, useMarkdown };
    const analysis = await nebiusClient.analyzeDocument(request);
    res.json(analysis);
  } catch (error) {
    console.error('Document analysis error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Document analysis failed", error: detail });
  }
});
// Research synthesis using Nebius AI
// POST /api/research-synthesis — body: { query, documentIds: number[] }
// Loads the referenced documents and asks the ingestion service to
// synthesize them against the query.
app.post("/api/research-synthesis", async (req, res) => {
  try {
    const { query, documentIds } = req.body;
    if (!query || !Array.isArray(documentIds)) {
      return res.status(400).json({ message: "Query and document IDs are required" });
    }
    // Fetch all requested documents in parallel; unknown IDs resolve to undefined.
    const documents = await Promise.all(
      documentIds.map(id => storage.getDocument(id))
    );
    // filter(Boolean) does not narrow `T | undefined` under strict null checks,
    // so use an explicit type predicate to drop missing documents safely.
    const validDocuments = documents.filter(
      (doc): doc is NonNullable<typeof doc> => doc != null
    );
    if (validDocuments.length === 0) {
      return res.status(400).json({ message: "No valid documents found" });
    }
    const synthesis = await smartIngestionService.generateResearchSynthesis(
      query,
      validDocuments
    );
    res.json(synthesis);
  } catch (error) {
    console.error('Research synthesis error:', error);
    res.status(500).json({
      message: "Research synthesis failed",
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
});
// Query enhancement using Nebius AI.
// POST /api/enhance-query — body: { query, context? }
app.post("/api/enhance-query", async (req, res) => {
  try {
    const { query, context } = req.body;

    if (!query) {
      return res.status(400).json({ message: "Query is required" });
    }

    const enhancement = await nebiusClient.enhanceQuery(query, context);

    // DeepSeek occasionally leaks <think>...</think> reasoning into output;
    // scrub both string fields before sending them to the client.
    const { enhancedQuery, intent } = enhancement;
    enhancement.enhancedQuery = cleanThinkingTags(enhancedQuery);
    enhancement.intent = cleanThinkingTags(intent);

    res.json(enhancement);
  } catch (error) {
    console.error('Query enhancement error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Query enhancement failed", error: detail });
  }
});
// Modal processing status endpoint.
// GET /api/modal-task/:taskId — proxies a status lookup to the Modal client.
app.get("/api/modal-task/:taskId", async (req, res) => {
  try {
    const status = await modalClient.getTaskStatus(req.params.taskId);
    res.json(status);
  } catch (error) {
    console.error('Modal task status error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Failed to get task status", error: detail });
  }
});
// Batch document ingestion using Modal.
// POST /api/batch-ingest — body: { documents: Array<{content?, filename?, contentType?, metadata?}> }
app.post("/api/batch-ingest", async (req, res) => {
  try {
    const { documents } = req.body;

    const hasWork = Array.isArray(documents) && documents.length > 0;
    if (!hasWork) {
      return res.status(400).json({ message: "Documents array is required" });
    }

    // Shape each incoming record into the upload format the ingestion
    // service expects, substituting safe defaults for missing fields.
    const uploads = documents.map((doc: any) => ({
      file: doc.content || '',
      filename: doc.filename || 'unknown.txt',
      contentType: doc.contentType || 'text/plain',
      metadata: doc.metadata || {}
    }));

    res.json(await smartIngestionService.batchIngestDocuments(uploads));
  } catch (error) {
    console.error('Batch ingestion error:', error);
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({ message: "Batch ingestion failed", error: detail });
  }
});
// API Health Check endpoint.
// GET /api/health — 200 when every service is error-free, 503 otherwise.
app.get("/api/health", async (req, res) => {
  try {
    const { checkAPIHealth } = await import('./api-health-check');
    const services = await checkAPIHealth();

    // Healthy overall only when no individual service reports 'error'.
    const anyError = services.some(service => service.status === 'error');

    res.status(anyError ? 503 : 200).json({
      overall: anyError ? 'issues_detected' : 'healthy',
      services,
      timestamp: new Date().toISOString()
    });
  } catch (error) {
    const detail = error instanceof Error ? error.message : 'Unknown error';
    res.status(500).json({
      overall: 'error',
      message: 'Health check failed',
      error: detail
    });
  }
});
// Generate embeddings using Nebius
// POST /api/embeddings — body: { input: string | string[], model? }
app.post("/api/embeddings", async (req, res) => {
  try {
    const { input, model = 'text-embedding-ada-002' } = req.body;
    if (!input) {
      return res.status(400).json({ message: "Input text is required" });
    }
    // Embedding APIs conventionally accept a string OR an array of strings.
    // Build the log preview without assuming a string — calling .substring()
    // on an array would throw here and turn a valid request into a 500.
    const preview = typeof input === 'string'
      ? `${input.substring(0, 100)}...`
      : Array.isArray(input)
        ? `[${input.length} input(s)]`
        : JSON.stringify(input);
    console.log('Generating embeddings for input:', preview);
    const embeddings = await nebiusClient.createEmbeddings({ input, model });
    console.log('Embeddings generated successfully');
    res.json(embeddings);
  } catch (error) {
    console.error('Embeddings error:', error);
    res.status(500).json({
      message: "Embedding generation failed",
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
});
// Other routes...
// GET /api/documents?limit=&offset= — paginated document listing.
app.get("/api/documents", async (req, res) => {
  try {
    // Parse pagination params defensively: NaN or negative values fall back
    // to the defaults instead of being forwarded to the storage layer.
    const parsedLimit = parseInt(req.query.limit as string, 10);
    const parsedOffset = parseInt(req.query.offset as string, 10);
    const limit = Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : 50;
    const offset = Number.isInteger(parsedOffset) && parsedOffset >= 0 ? parsedOffset : 0;
    const documents = await storage.getDocuments(limit, offset);
    res.json(documents);
  } catch (error) {
    // Log for diagnosis (previously this failure was swallowed silently),
    // matching the error-handling style of the other handlers in this file.
    console.error('Failed to fetch documents:', error);
    res.status(500).json({ message: "Failed to fetch documents" });
  }
});
// Register document routes - enable uploads by default for all environments
// Hugging Face Spaces have /tmp storage which is suitable for uploads
// NOTE(review): HF_TOKEN only proves a token is configured, not that the
// process is running inside a Space — confirm this heuristic is intended.
const isHuggingFaceSpace = process.env.SPACE_ID || process.env.HF_SPACE_ID ||
process.env.HUGGINGFACE_SPACE_ID || process.env.HF_TOKEN || false;
// Computed only for the diagnostic log below — it does not gate whether
// uploads are enabled (see isDocumentUploadEnabled).
const hasWritableStorage = process.env.NODE_ENV === 'production' ?
fs.existsSync('/tmp') :
true; // Development always has writable storage
// Force enable uploads for Hugging Face Spaces, otherwise check DISABLE_UPLOADS
const isDocumentUploadEnabled = isHuggingFaceSpace ? true : (process.env.DISABLE_UPLOADS !== 'true');
// Emit the resolved configuration so deploy-time misconfiguration is easy to spot.
console.log('๐Ÿ” Environment check:', {
NODE_ENV: process.env.NODE_ENV,
DISABLE_UPLOADS: process.env.DISABLE_UPLOADS,
isHuggingFaceSpace: !!isHuggingFaceSpace,
hasWritableStorage,
isDocumentUploadEnabled
});
// Mount the full document router when uploads are allowed; otherwise mount
// the fallback router (presumably upload-rejecting stubs — see ./upload-fallback).
if (isDocumentUploadEnabled) {
console.log('โœ… Document uploads enabled - full functionality available');
app.use("/api/documents", documentRoutes);
} else {
console.log('โ„น๏ธ Document uploads disabled - using fallback routes');
app.use("/api/documents", uploadFallbackRoutes);
}
const httpServer = createServer(app);
return httpServer;
}