// Source: Hugging Face Space "Agents-MCP-Hackathon/KnowledgeBridge" (status: Running)
// File size: 49,034 bytes

import { Express } from "express";
import { createServer, Server } from "http";
import { z } from "zod";
import fs from "fs";
import { storage } from "./storage";
import { searchRequestSchema } from "@shared/schema";
import { smartIngestionService } from "./smart-ingestion";
import { nebiusClient } from "./nebius-client";
import { modalClient } from "./modal-client";
import documentRoutes from "./document-routes";
import uploadFallbackRoutes from "./upload-fallback";

/**
 * Shape of the repository objects this file reads from the `items` array of a
 * GitHub repository-search response. Every field declared here is consumed by
 * `transformGitHubRepoToDocument` when a repository is turned into a document.
 */
interface GitHubRepo {
  id: number; // reused as the document id
  name: string; // short repository name, used in the document title
  full_name: string; // used in titles, content and log messages
  // NOTE(review): consumers guard this field with truthiness checks and `?.`,
  // so it is presumably null/empty for some repositories — confirm against the API.
  description: string;
  html_url: string; // the URL that is validated and exposed as the document url
  stargazers_count: number;
  language: string;
  topics: string[];
  created_at: string;
  updated_at: string;
}

// Using Nebius client instead of OpenAI for all AI operations

/**
 * Strips DeepSeek R1 style `<think>` reasoning sections from model output.
 *
 * Closed `<think>...</think>` sections (plus the whitespace that follows them)
 * are removed first; if an unclosed `<think>` is still present, everything from
 * that tag to the end of the string is dropped. The cleaned text is trimmed.
 *
 * @param text - Raw model output.
 * @returns The cleaned, trimmed text; the input is returned untouched (and
 *          untrimmed) when it is not a string or contains no `<think>` tag.
 */
function cleanThinkingTags(text: string): string {
  const openTag = '<think>';

  // Nothing to clean: hand the value back exactly as received.
  if (typeof text !== 'string' || !text.includes(openTag)) {
    return text;
  }

  const withoutClosedSections = text.replace(/<think>[\s\S]*?<\/think>\s*/g, '');

  // An opening tag that survived the replace has no closing partner.
  const danglingAt = withoutClosedSections.indexOf(openTag);
  const visible = danglingAt === -1
    ? withoutClosedSections
    : withoutClosedSections.substring(0, danglingAt);

  return visible.trim();
}

/**
 * Checks whether a URL is reachable, delegating to content-aware validators
 * for hosts that are known to need them.
 *
 * - `arxiv.org` (and subdomains) go through `validateArxivUrl`.
 * - `vldb.org`, `cvpr.org` and `icse.org` (and subdomains) go through
 *   `validateContentUrl`, because they may answer 200 with an error page.
 * - Everything else gets a single HEAD request; 2xx and 3xx count as valid.
 *
 * Hosts are matched as the domain itself or a subdomain of it, so look-alike
 * hosts such as `arxiv.org.example.com` take the default HEAD path.
 *
 * @param url - The URL to validate.
 * @param timeout - Milliseconds before the request is aborted.
 * @returns `true` when the URL is considered valid; `false` on an invalid URL,
 *          a non-2xx/3xx status, a timeout or any other error. Never throws.
 */
async function validateUrl(url: string, timeout: number = 5000): Promise<boolean> {
  console.log(`Validating URL: ${url}`);

  // True for the domain itself or any of its subdomains.
  const hostIs = (hostname: string, domain: string): boolean =>
    hostname === domain || hostname.endsWith(`.${domain}`);

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    const { hostname } = new URL(url);

    // Special handling for ArXiv URLs to validate paper existence
    if (hostIs(hostname, 'arxiv.org')) {
      return await validateArxivUrl(url, controller.signal);
    }

    // Domains that might return 200 but show error pages
    const contentCheckedDomains = ['vldb.org', 'cvpr.org', 'icse.org'];
    if (contentCheckedDomains.some(domain => hostIs(hostname, domain))) {
      return await validateContentUrl(url, controller.signal);
    }

    // Default path: a HEAD request is enough to learn the status code.
    const response = await fetch(url, {
      method: 'HEAD',
      signal: controller.signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (URL Validator)'
      }
    });

    // Consider 2xx and 3xx status codes as valid
    const isValid = response.status >= 200 && response.status < 400;
    console.log(`URL ${url} validation result: ${isValid ? 'VALID' : 'INVALID'} (${response.status})`);
    return isValid;
  } catch (error) {
    console.log(`URL ${url} validation failed: ${error instanceof Error ? error.message : String(error)}`);
    return false;
  } finally {
    // Always release the abort timer, whichever branch returned or threw.
    clearTimeout(timeoutId);
  }
}

/**
 * Validates that an arXiv abstract URL (`.../abs/<id>`) points at an existing paper.
 *
 * The identifier is checked against the new (`2301.12345`) and old
 * (`hep-ex/0307015`, `cs.AI/0101001`) formats, each with an optional version
 * suffix (`v2`). A trailing slash, query string or fragment is ignored when
 * extracting the identifier. The page is then fetched with GET and rejected
 * when the status is not OK or the body contains arXiv's "identifier not
 * recognized" wording.
 *
 * Generic words such as "error" or "not found" are deliberately NOT treated as
 * failure markers: they occur in the titles and abstracts of real papers, and
 * a non-OK status is already rejected above.
 *
 * @param url - The arXiv URL to validate.
 * @param signal - Abort signal used to cancel the request.
 * @returns `true` when the paper page looks valid; `false` otherwise. Never throws.
 */
async function validateArxivUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    // Extract paper ID from URL (without trailing slash, query string or fragment)
    const match = url.match(/arxiv\.org\/abs\/([^?#]+?)\/?(?:[?#].*)?$/);
    if (!match) {
      console.log(`Invalid ArXiv URL format: ${url}`);
      return false;
    }

    const paperId = match[1];

    // Validate ArXiv ID format; a version suffix such as "v1" is allowed
    const validFormats = [
      /^\d{4}\.\d{4,5}(v\d+)?$/, // New format: 2301.12345 or 2301.12345v2
      /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/, // Old format: cs.AI/1234567 or hep-ex/0307015v1
    ];

    const hasValidFormat = validFormats.some(regex => regex.test(paperId));
    if (!hasValidFormat) {
      console.log(`Invalid ArXiv paper ID format: ${paperId}`);
      return false;
    }

    // Try to fetch the paper to see if it exists
    const response = await fetch(url, {
      method: 'GET', // Need GET to check content
      signal: signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (ArXiv Validator)'
      }
    });

    if (!response.ok) {
      console.log(`ArXiv URL returned ${response.status}: ${url}`);
      return false;
    }

    // Only phrases specific to arXiv's unknown-identifier page are checked.
    const content = (await response.text()).toLowerCase();
    const errorIndicators = [
      'not recognized',
      'might instead try to search',
      'article identifier'
    ];

    const hasError = errorIndicators.some(indicator => content.includes(indicator));

    if (hasError) {
      console.log(`ArXiv paper not found: ${url}`);
      return false;
    }

    console.log(`ArXiv URL validation successful: ${url}`);
    return true;

  } catch (error) {
    console.log(`ArXiv URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}

/**
 * Validates a URL whose server may answer 200 while actually showing an error page.
 *
 * The page is fetched with GET. It is rejected when the status is not OK, or
 * when the text of its `<title>` or `<h1>` elements contains a common error
 * phrase. Only those headline elements are inspected: matching markers such as
 * "error" or "404" against the whole document would reject ordinary pages that
 * merely mention them in scripts or body copy. The trade-off is that an error
 * message appearing only in body text is not detected.
 *
 * @param url - The URL to validate.
 * @param signal - Abort signal used to cancel the request.
 * @returns `true` when the page looks like real content; `false` otherwise. Never throws.
 */
async function validateContentUrl(url: string, signal: AbortSignal): Promise<boolean> {
  try {
    const response = await fetch(url, {
      method: 'GET', // Need GET to check content
      signal: signal,
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (Content Validator)'
      }
    });

    if (!response.ok) {
      console.log(`Content URL returned ${response.status}: ${url}`);
      return false;
    }

    const html = await response.text();

    // Gather the text of every <title> and <h1>, with nested tags removed and
    // whitespace collapsed, so multi-word phrases match reliably.
    const headlineText = Array.from(
      html.matchAll(/<(title|h1)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi),
      (m) => (m[2] ?? '').replace(/<[^>]*>/g, ' ')
    )
      .join(' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Check the headline text for common error messages
    const errorIndicators = [
      '404',
      'not found',
      'page not found',
      'does not exist',
      'error',
      'can\'t be reached',
      'site is temporarily unavailable'
    ];

    const hasError = errorIndicators.some(indicator => headlineText.includes(indicator));

    if (hasError) {
      console.log(`Content validation failed for: ${url}`);
      return false;
    }

    console.log(`Content URL validation successful: ${url}`);
    return true;

  } catch (error) {
    console.log(`Content URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}

/**
 * Validates a list of URLs in batches so that only a limited number of
 * requests are in flight at once.
 *
 * Duplicate URLs are validated once. A `concurrencyLimit` below 1 (or NaN) is
 * treated as 1, so the batching loop always makes progress.
 *
 * @param urls - URLs to validate.
 * @param concurrencyLimit - Maximum number of URLs validated in parallel.
 * @returns A map from each distinct URL to its validation result.
 */
async function validateUrls(urls: string[], concurrencyLimit: number = 5): Promise<Map<string, boolean>> {
  const results = new Map<string, boolean>();

  // Guard the step size: 0, negative or NaN would never advance the loop.
  const batchSize = concurrencyLimit >= 1 ? Math.floor(concurrencyLimit) : 1;

  // The result map is keyed by URL, so validating a duplicate is wasted work.
  const uniqueUrls = [...new Set(urls)];

  // Process URLs in batches to avoid overwhelming the network
  for (let i = 0; i < uniqueUrls.length; i += batchSize) {
    const batch = uniqueUrls.slice(i, i + batchSize);
    await Promise.all(
      batch.map(async (url) => {
        results.set(url, await validateUrl(url));
      })
    );
  }

  return results;
}

/**
 * Searches several public data sources for a query and returns the hits whose
 * URLs pass validation.
 *
 * Sources, queried in order (each failure is logged and skipped):
 *  1. Wikipedia page summary — always.
 *  2. ArXiv API — only when the query mentions an ML/AI/CS keyword; at most
 *     two papers are taken.
 *  3. REST Countries API — only when the query mentions "country" or "nation".
 *
 * @param query - The user's search text.
 * @param maxResults - Upper bound on the number of results returned.
 * @returns Result objects with `title`, `content`, `url`, `source` and `type`;
 *          an empty array when an unexpected error occurs.
 */
async function searchWeb(query: string, maxResults: number = 10): Promise<any[]> {
  const hits: any[] = [];

  try {
    console.log(`Starting web search for: "${query}"`);
    const loweredQuery = query.toLowerCase();

    // 1. Wikipedia search for general knowledge
    try {
      const pageSlug = encodeURIComponent(query.replace(/\s+/g, '_'));
      const wikiSearchUrl = `https://en.wikipedia.org/api/rest_v1/page/summary/${pageSlug}`;
      console.log('Searching Wikipedia:', wikiSearchUrl);

      const wikiResponse = await fetch(wikiSearchUrl, {
        headers: {
          'User-Agent': 'Knowledge-Base-Browser/1.0'
        },
        signal: AbortSignal.timeout(3000) // 3 second timeout
      });

      if (wikiResponse.ok) {
        const summaryPage = await wikiResponse.json();
        const extract = summaryPage.extract;
        if (extract && extract.length > 50) {
          hits.push({
            title: summaryPage.title,
            content: extract,
            url: summaryPage.content_urls?.desktop?.page || `https://en.wikipedia.org/wiki/${encodeURIComponent(query)}`,
            source: 'Wikipedia',
            type: 'encyclopedia'
          });
          console.log('Found Wikipedia result:', summaryPage.title);
        }
      }
    } catch (wikiError) {
      console.log('Wikipedia search failed:', wikiError instanceof Error ? wikiError.message : String(wikiError));
    }

    // 2. ArXiv search for research papers (for ML/AI/CS topics)
    const researchKeywords = [
      'machine learning',
      'neural network',
      'algorithm',
      'artificial intelligence',
      'data science',
      'deep learning'
    ];

    if (researchKeywords.some(keyword => loweredQuery.includes(keyword))) {
      try {
        const arxivUrl = `http://export.arxiv.org/api/query?search_query=all:${encodeURIComponent(query)}&start=0&max_results=3&sortBy=relevance&sortOrder=descending`;
        console.log('Searching ArXiv for research papers');

        const arxivResponse = await fetch(arxivUrl, {
          signal: AbortSignal.timeout(5000) // 5 second timeout
        });

        if (arxivResponse.ok) {
          const arxivXml = await arxivResponse.text();

          // The text before the first <entry> is feed metadata; keep the next two entries.
          for (const entry of arxivXml.split('<entry>').slice(1, 3)) {
            const titleMatch = entry.match(/<title[^>]*>([^<]+)<\/title>/);
            const summaryMatch = entry.match(/<summary[^>]*>([^<]+)<\/summary>/);
            const linkMatch = entry.match(/<id[^>]*>([^<]+)<\/id>/);

            if (!titleMatch || !summaryMatch || !linkMatch) {
              continue;
            }

            const title = titleMatch[1].trim();
            const summary = summaryMatch[1].trim().substring(0, 300);
            const url = linkMatch[1].trim();

            if (!title || summary.length <= 50) {
              continue;
            }

            hits.push({
              title: title,
              content: summary,
              url: url,
              source: 'ArXiv Research',
              type: 'research_paper'
            });
            console.log('Found ArXiv paper:', title);
          }
        }
      } catch (arxivError) {
        console.log('ArXiv search failed:', arxivError instanceof Error ? arxivError.message : String(arxivError));
      }
    }

    // 3. Try REST Countries API for country-related queries
    if (['country', 'nation'].some(keyword => loweredQuery.includes(keyword))) {
      try {
        const countryQuery = query.replace(/country|nation/gi, '').trim();
        const countryUrl = `https://restcountries.com/v3.1/name/${encodeURIComponent(countryQuery)}`;

        const countryResponse = await fetch(countryUrl, {
          signal: AbortSignal.timeout(3000) // 3 second timeout
        });

        if (countryResponse.ok) {
          const countryData = await countryResponse.json();
          if (Array.isArray(countryData) && countryData.length > 0) {
            const country = countryData[0];
            const commonName = country.name.common;
            hits.push({
              title: `${commonName} - Country Information`,
              content: `${commonName} is located in ${country.region}, ${country.subregion}. Capital: ${country.capital?.[0] || 'N/A'}. Population: ${country.population?.toLocaleString() || 'Unknown'}. Official languages: ${Object.values(country.languages || {}).join(', ')}.`,
              url: `https://en.wikipedia.org/wiki/${encodeURIComponent(commonName)}`,
              source: 'REST Countries API',
              type: 'geographic'
            });
            console.log('Found country information:', commonName);
          }
        }
      } catch (countryError) {
        console.log('Country search failed:', countryError instanceof Error ? countryError.message : String(countryError));
      }
    }

    console.log(`Web search completed. Found ${hits.length} results.`);

    // Nothing to validate
    if (hits.length === 0) {
      return hits.slice(0, maxResults);
    }

    // Validate URLs before returning results
    console.log('Validating URLs for accessibility...');
    const validationResults = await validateUrls(hits.map(hit => hit.url));

    // Filter out results with invalid URLs
    const validResults = hits.filter(hit => {
      const isValid = validationResults.get(hit.url);
      if (!isValid) {
        console.log(`Filtered out invalid URL: ${hit.url} (${hit.title})`);
      }
      return isValid;
    });

    console.log(`URL validation completed. ${validResults.length}/${hits.length} URLs are accessible.`);
    return validResults.slice(0, maxResults);
  } catch (error) {
    console.error('Web search error:', error);
    return [];
  }
}

/**
 * Converts one web search hit into the document shape used for search results.
 *
 * @param result - A hit carrying `title`, `content`, `url`, `source` and `type`.
 * @param rank - Zero-based position of the hit; the document's `rank` is `rank + 1`.
 * @param query - The search text that produced the hit.
 * @returns A document object. Its relevance score starts at 0.6, drops by 0.1
 *          per rank and never goes below 0.2; the snippet is the first 200
 *          characters of the content followed by `...` when the content is longer.
 */
function transformWebResultToDocument(result: any, rank: number, query: string): any {
  const { content, title, source, url, type } = result;

  let snippet = content;
  if (content.length > 200) {
    snippet = content.substring(0, 200) + '...';
  }

  const metadata = {
    search_type: type,
    fetched_at: new Date().toISOString()
  };

  // Lower scores for external results
  const relevanceScore = Math.max(0.2, 0.6 - (rank * 0.1));

  return {
    id: `web_${Date.now()}_${rank}`,
    title,
    content,
    snippet,
    source,
    sourceType: 'web',
    url,
    metadata,
    relevanceScore,
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.2 + 0.1,
    tokenCount: Math.floor(content.length / 4)
  };
}

/**
 * Searches GitHub repositories for a query, most-starred first.
 *
 * Query shaping:
 *  - "<topic> by <author>" searches `<topic> user:<author>` (spaces removed
 *    from the author). "by" must be a separate word, so queries such as
 *    "ruby on rails" are not mistaken for an author search.
 *  - Queries mentioning "data structures" or "algorithm" get extra quoted
 *    terms and `language:python`.
 *  - Anything else gets `language:python` appended.
 *
 * When an author search yields nothing (no items, or a 422 response to the
 * `user:` qualifier), progressively looser fallback queries are tried. The
 * first fallback with results is returned as-is, preferring repositories that
 * mention the author. Results of the primary query are URL-validated before
 * being returned.
 *
 * The `Authorization` header is only sent when `GITHUB_TOKEN` is set;
 * otherwise the request is made unauthenticated.
 *
 * @param query - Free-text search, optionally containing "by <author>".
 * @param maxResults - Page size requested from GitHub.
 * @returns Repository objects from the GitHub API; an empty array on API or
 *          network errors.
 */
async function searchGitHubRepos(query: string, maxResults: number = 10): Promise<any[]> {
  try {
    // "by <author>" where the author may be several space-separated words
    const authorPattern = /\bby\s+([a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*)/i;

    const headers: Record<string, string> = {
      'Accept': 'application/vnd.github.v3+json',
      'User-Agent': 'Knowledge-Base-Browser'
    };
    const token = process.env.GITHUB_TOKEN;
    if (token) {
      // Omitting the header avoids sending the literal text "token undefined".
      headers['Authorization'] = `token ${token}`;
    }

    // Single place that builds and sends a repository search request.
    const runSearch = (q: string) =>
      fetch(`https://api.github.com/search/repositories?q=${encodeURIComponent(q)}&sort=stars&order=desc&per_page=${maxResults}`, { headers });

    const lowerQuery = query.toLowerCase();
    const byAuthorMatch = query.match(authorPattern);
    const authorName = byAuthorMatch ? byAuthorMatch[1].trim() : '';
    const topicPart = byAuthorMatch ? query.replace(authorPattern, '').trim() : '';

    let searchQuery: string;
    if (byAuthorMatch) {
      searchQuery = `${topicPart} user:${authorName.replace(/\s+/g, '')}`;
    } else if (lowerQuery.includes('data structures') || lowerQuery.includes('algorithm')) {
      // Enhanced search for data structures and algorithms
      searchQuery = `${query} "data structures" OR "algorithms" language:python`;
    } else {
      searchQuery = `${query} language:python`;
    }

    console.log('GitHub search query:', searchQuery);

    const response = await runSearch(searchQuery);

    // A 422 on an author search is handled as "no results" so the fallbacks
    // below still run; every other failure ends the search.
    const authorLookupRejected = byAuthorMatch !== null && response.status === 422;
    if (!response.ok) {
      console.error('GitHub API error:', response.status, response.statusText);
      if (!authorLookupRejected) {
        return [];
      }
    }

    const data = response.ok ? await response.json() : {};

    // If no results with author search, try alternative search strategies
    if (byAuthorMatch && (!data.items || data.items.length === 0)) {
      const lowerAuthor = authorName.toLowerCase();

      // Fallbacks without language restrictions; blank queries are skipped.
      const fallbackQueries = [
        `"${authorName}" ${topicPart}`,
        `${topicPart} "${authorName}"`,
        `${authorName} ${topicPart}`,
        `${topicPart}`
      ].filter(candidate => candidate.trim().length > 0);

      for (const fallbackQuery of fallbackQueries) {
        console.log('Trying fallback query:', fallbackQuery);

        const fallbackResponse = await runSearch(fallbackQuery);
        if (!fallbackResponse.ok) {
          continue;
        }

        const fallbackData = await fallbackResponse.json();
        if (!fallbackData.items || fallbackData.items.length === 0) {
          continue;
        }

        // Prioritize results that come from or mention the specified author
        const authorFilteredResults = fallbackData.items.filter((repo: any) =>
          repo.owner?.login?.toLowerCase().includes(lowerAuthor) ||
          repo.full_name?.toLowerCase().includes(lowerAuthor) ||
          repo.description?.toLowerCase().includes(lowerAuthor)
        );

        return authorFilteredResults.length > 0 ? authorFilteredResults : fallbackData.items;
      }
    }

    const repos = data.items || [];

    // Validate GitHub repository URLs (though GitHub repos are usually reliable)
    if (repos.length > 0) {
      console.log('Validating GitHub repository URLs...');
      const urls = repos.map((repo: GitHubRepo) => repo.html_url);
      const validationResults = await validateUrls(urls);

      // Filter out repos with invalid URLs
      const validRepos = repos.filter((repo: GitHubRepo) => {
        const isValid = validationResults.get(repo.html_url);
        if (!isValid) {
          console.log(`Filtered out invalid GitHub repo: ${repo.html_url} (${repo.full_name})`);
        }
        return isValid;
      });

      console.log(`GitHub URL validation completed. ${validRepos.length}/${repos.length} repositories are accessible.`);
      return validRepos;
    }

    return repos;
  } catch (error) {
    console.error('Error fetching GitHub repos:', error);
    return [];
  }
}

/**
 * Converts a GitHub repository record into the document shape used for search results.
 *
 * Missing values are tolerated: an empty or null description becomes
 * "No description available", a null language is shown as "Unknown" in the
 * content text, and a missing topics list is treated as empty.
 *
 * @param repo - Repository record from the GitHub search API.
 * @param rank - Zero-based position of the repository; the document's `rank` is `rank + 1`.
 * @param query - The search text that produced the repository.
 * @returns A document object. Its relevance score starts at 0.7, drops by 0.1
 *          per rank and never goes below 0.3. `metadata.language` keeps the
 *          raw API value.
 */
function transformGitHubRepoToDocument(repo: GitHubRepo, rank: number, query: string): any {
  const description = repo.description || 'No description available';
  const snippet = description.length > 200 ? description.substring(0, 200) + '...' : description;

  // Normalize optional API fields so the template below never prints "null" or throws.
  const languageLabel = repo.language ?? 'Unknown';
  const topics = repo.topics ?? [];

  return {
    id: repo.id,
    title: `${repo.name} - ${repo.full_name}`,
    content: `${description}\n\nRepository: ${repo.full_name}\nLanguage: ${languageLabel}\nStars: ${repo.stargazers_count}\nTopics: ${topics.join(', ')}\nCreated: ${repo.created_at}\nLast Updated: ${repo.updated_at}`,
    snippet,
    source: `GitHub Repository`,
    sourceType: 'code',
    url: repo.html_url,
    metadata: {
      stars: repo.stargazers_count,
      language: repo.language,
      topics,
      created_at: repo.created_at,
      updated_at: repo.updated_at
    },
    relevanceScore: Math.max(0.3, 0.7 - (rank * 0.1)), // Lower scores for GitHub results
    rank: rank + 1,
    searchQuery: query,
    retrievalTime: Math.random() * 0.3 + 0.1,
    // An absent description is counted as 100 characters.
    tokenCount: Math.floor((repo.description?.length || 100) / 4)
  };
}

export async function registerRoutes(app: Express): Promise<Server> {
  // Knowledge graph data endpoint
  app.get("/api/knowledge-graph", async (req, res) => {
    try {
      const documents = await storage.getDocuments(50);
      
      const nodes: any[] = [];
      const links: any[] = [];
      
      // Create document nodes from actual storage
      documents.forEach(doc => {
        nodes.push({
          id: `doc_${doc.id}`,
          label: doc.title.substring(0, 50) + (doc.title.length > 50 ? "..." : ""),
          type: "document",
          size: 12,
          color: "#3b82f6",
          metadata: {
            title: doc.title,
            sourceType: doc.sourceType,
            year: new Date(doc.createdAt).getFullYear(),
            id: doc.id
          }
        });
      });
      
      // Extract concepts from document content
      const conceptMap = new Map<string, number>();
      const conceptToDocuments = new Map<string, number[]>();
      
      documents.forEach(doc => {
        const content = doc.content.toLowerCase();
        const concepts = [
          'ai', 'artificial intelligence', 'machine learning', 'deep learning',
          'neural networks', 'transformer', 'attention', 'embedding', 'vector',
          'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini',
          'multimodal', 'fine-tuning', 'training', 'optimization', 'safety',
          'alignment', 'reasoning', 'language model', 'nlp', 'computer vision'
        ];
        
        concepts.forEach(concept => {
          if (content.includes(concept)) {
            conceptMap.set(concept, (conceptMap.get(concept) || 0) + 1);
            if (!conceptToDocuments.has(concept)) {
              conceptToDocuments.set(concept, []);
            }
            conceptToDocuments.get(concept)!.push(doc.id);
          }
        });
      });
      
      // Create document-to-document connections based on shared concepts
      const documentConnections = new Map<string, Set<number>>();
      
      documents.forEach(doc1 => {
        const doc1Concepts = new Set<string>();
        const content1 = doc1.content.toLowerCase();
        
        // Enhanced concept detection for better connections
        const allConcepts = [
          'ai', 'artificial intelligence', 'machine learning', 'deep learning',
          'neural networks', 'transformer', 'attention', 'embedding', 'vector',
          'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini',
          'multimodal', 'fine-tuning', 'training', 'optimization', 'safety',
          'alignment', 'reasoning', 'language model', 'nlp', 'computer vision',
          'code generation', 'programming', 'software', 'development', 'copilot',
          'constitutional ai', 'rlhf', 'instruction tuning', 'benchmarks',
          'performance', 'efficiency', 'compression', 'quantization', 'edge ai',
          'mamba', 'mixture of experts', 'moe', 'architecture', 'scaling'
        ];
        
        allConcepts.forEach(concept => {
          if (content1.includes(concept)) {
            doc1Concepts.add(concept);
          }
        });
        
        // Find related documents with shared concepts
        documents.forEach(doc2 => {
          if (doc1.id !== doc2.id) {
            const content2 = doc2.content.toLowerCase();
            let sharedConcepts = 0;
            
            doc1Concepts.forEach(concept => {
              if (content2.includes(concept)) {
                sharedConcepts++;
              }
            });
            
            // Create connection if documents share 3+ concepts
            if (sharedConcepts >= 3) {
              const connectionKey = `${Math.min(doc1.id, doc2.id)}_${Math.max(doc1.id, doc2.id)}`;
              if (!documentConnections.has(connectionKey)) {
                documentConnections.set(connectionKey, new Set([doc1.id, doc2.id]));
                
                links.push({
                  source: `doc_${doc1.id}`,
                  target: `doc_${doc2.id}`,
                  relationship: "related_concepts",
                  strength: Math.min(sharedConcepts / 10, 1),
                  color: "#3b82f6"
                });
              }
            }
          }
        });
      });

      // Create concept nodes for concepts that appear in multiple documents
      conceptMap.forEach((count, concept) => {
        if (count >= 2) {
          nodes.push({
            id: `concept_${concept.replace(/\s+/g, '_')}`,
            label: concept,
            type: "concept",
            size: 8 + count * 2,
            color: "#10b981",
            metadata: {
              documentCount: count,
              concept: concept
            }
          });
          
          // Link concept to documents
          const relatedDocs = conceptToDocuments.get(concept) || [];
          relatedDocs.forEach(docId => {
            links.push({
              source: `doc_${docId}`,
              target: `concept_${concept.replace(/\s+/g, '_')}`,
              relationship: "contains_concept",
              strength: 1,
              color: "#10b981"
            });
          });
        }
      });
      
      // Extract research teams from document metadata
      const researchTeams = new Map<string, number[]>();
      documents.forEach(doc => {
        if (doc.metadata) {
          let teamName = '';
          const metadata = typeof doc.metadata === 'string' ? JSON.parse(doc.metadata) : doc.metadata;
          
          // Extract team names from authors or venue
          if (metadata.authors && Array.isArray(metadata.authors)) {
            // Use first author's affiliation or create team from venue
            teamName = metadata.venue || 'Research Team';
          } else if (metadata.venue) {
            teamName = metadata.venue;
          } else if (doc.source) {
            // Extract team from source
            if (doc.source.includes('OpenAI')) teamName = 'OpenAI Research';
            else if (doc.source.includes('Anthropic')) teamName = 'Anthropic';
            else if (doc.source.includes('Google') || doc.source.includes('DeepMind')) teamName = 'Google DeepMind';
            else if (doc.source.includes('LangChain')) teamName = 'LangChain Team';
            else if (doc.source.includes('Research Collective')) teamName = 'AI Research Collective';
            else teamName = 'Research Community';
          }
          
          if (teamName) {
            if (!researchTeams.has(teamName)) {
              researchTeams.set(teamName, []);
            }
            researchTeams.get(teamName)!.push(doc.id);
          }
        }
      });
      
      // Create research team nodes
      researchTeams.forEach((docIds, teamName) => {
        nodes.push({
          id: `team_${teamName.replace(/\s+/g, '_')}`,
          label: teamName,
          type: "author",
          size: 8 + docIds.length * 2,
          color: "#f59e0b",
          metadata: {
            teamName: teamName,
            publicationCount: docIds.length
          }
        });
        
        // Link team to documents
        docIds.forEach(docId => {
          links.push({
            source: `team_${teamName.replace(/\s+/g, '_')}`,
            target: `doc_${docId}`,
            relationship: "authored_by",
            strength: 0.8,
            color: "#f59e0b"
          });
        });
      });

      // Create source type clusters
      const sourceTypes = new Map<string, number[]>();
      documents.forEach(doc => {
        const sourceType = doc.sourceType || 'unknown';
        if (!sourceTypes.has(sourceType)) {
          sourceTypes.set(sourceType, []);
        }
        sourceTypes.get(sourceType)!.push(doc.id);
      });
      
      sourceTypes.forEach((docIds, sourceType) => {
        if (docIds.length >= 2) {
          nodes.push({
            id: `source_${sourceType}`,
            label: sourceType.charAt(0).toUpperCase() + sourceType.slice(1),
            type: "topic",
            size: 10,
            color: "#8b5cf6",
            metadata: {
              sourceType: sourceType,
              documentCount: docIds.length
            }
          });
          
          // Link source type to documents
          docIds.forEach(docId => {
            links.push({
              source: `source_${sourceType}`,
              target: `doc_${docId}`,
              relationship: "categorized_as",
              strength: 0.6,
              color: "#8b5cf6"
            });
          });
        }
      });
      
      res.json({
        nodes,
        links,
        stats: {
          totalDocuments: documents.length,
          totalConcepts: conceptMap.size,
          totalResearchTeams: researchTeams.size,
          totalSourceTypes: sourceTypes.size
        }
      });
      
    } catch (error) {
      console.error("Knowledge graph generation failed:", error);
      res.status(500).json({ 
        error: "Failed to generate knowledge graph",
        nodes: [],
        links: [],
        stats: { totalDocuments: 0, totalConcepts: 0, totalResearchTeams: 0, totalSourceTypes: 0 }
      });
    }
  });

  // Enhanced search with web fallback
  app.post("/api/search", async (req, res) => {
    try {
      const searchRequest = searchRequestSchema.parse(req.body);
      const streaming = req.body.streaming === true;
      const startTime = Date.now();
      
      let allDocuments: any[] = [];
      
      // Enhanced multi-source search for semantic queries
      if (searchRequest.searchType === "semantic") {
        console.log(`🔍 Enhanced multi-source search for: "${searchRequest.query}"`);
        
        // 1. First, always do keyword search on knowledge base
        console.log('📚 Searching knowledge base...');
        
        // Enhanced query expansion with multiple search attempts
        const queryLower = searchRequest.query.toLowerCase();
        const searchQueries = [searchRequest.query]; // Start with original query
        
        // Add related terms for better matching
        if (queryLower.includes('mistral')) {
          searchQueries.push('Mixtral', 'Mistral AI');
        }
        if (queryLower.includes('mixtral')) {
          searchQueries.push('Mistral', 'mixture of experts');
        }
        if (queryLower.includes('llama')) {
          searchQueries.push('LLaMA', 'Large Language Model Meta AI');
        }
        if (queryLower.includes('gpt')) {
          searchQueries.push('GPT', 'Generative Pre-trained Transformer');
        }
        if (queryLower.includes('transformer') || queryLower.includes('attention')) {
          searchQueries.push('Attention Is All You Need', 'transformer', 'attention mechanism');
        }
        if (queryLower.includes('constitutional')) {
          searchQueries.push('Constitutional AI', 'harmlessness', 'AI feedback');
        }
        if (queryLower.includes('rag') || queryLower.includes('retrieval')) {
          searchQueries.push('Retrieval-Augmented Generation', 'retrieval augmented', 'knowledge-intensive');
        }
        
        // Search with each query and combine results
        const allSearchResults = new Map<number, any>();
        
        for (const query of searchQueries) {
          const searchResult = await storage.searchDocuments({ ...searchRequest, query });
          for (const doc of searchResult.results || []) {
            if (!allSearchResults.has(doc.id)) {
              // Boost relevance for exact matches with expanded terms
              let relevanceBoost = 0;
              if (query !== searchRequest.query) {
                relevanceBoost = 0.2; // Boost expanded term matches
              }
              
              allSearchResults.set(doc.id, {
                ...doc,
                relevanceScore: Math.min(doc.relevanceScore + relevanceBoost, 1.0)
              });
            }
          }
        }
        
        allDocuments = Array.from(allSearchResults.values());
        
        allDocuments = allDocuments.map(doc => ({
          ...doc,
          relevanceScore: Math.min(doc.relevanceScore + 0.6, 1.0), // Boost local results
          rank: doc.rank,
          snippet: doc.snippet || doc.content.substring(0, 200) + '...'
        }));
        
        console.log(`📚 Found ${allDocuments.length} local documents`);
        
        console.log(`📚 Query expansion searched for: ${searchQueries.join(', ')}`);
        
        // Skip AI enhancement for now to test query expansion
        // TODO: Re-enable AI enhancement after fixing query expansion
      } else {
        // Use regular keyword search for other search types
        const localResults = await storage.searchDocuments(searchRequest);
        // Boost relevance scores for knowledge base documents to prioritize them
        allDocuments = (localResults.results || []).map(doc => ({
          ...doc,
          relevanceScore: Math.min(doc.relevanceScore + 0.5, 1.0) // Boost by 0.5
        }));
      }
      
      // Validate URLs in local storage results as well
      if (allDocuments.length > 0) {
        console.log('Validating URLs in local storage results...');
        const documentsWithUrls = allDocuments.filter(doc => doc.url);
        
        if (documentsWithUrls.length > 0) {
          const urls = documentsWithUrls.map(doc => doc.url).filter((url): url is string => url !== null);
          const validationResults = await validateUrls(urls);
          
          // Filter out documents with invalid URLs
          allDocuments = allDocuments.filter(doc => {
            if (!doc.url) return true; // Keep documents without URLs
            
            const isValid = validationResults.get(doc.url);
            if (!isValid) {
              console.log(`Filtered out local document with invalid URL: ${doc.url} (${doc.title})`);
            }
            return isValid;
          });
          
          console.log(`Local URL validation completed. ${allDocuments.length} documents have valid URLs.`);
        }
      }
      
      // Always search external sources to provide comprehensive results
      console.log(`🌐 Searching external sources to supplement ${allDocuments.length} local results...`);
      
      // Check if we should search GitHub
      const isCodeQuery = searchRequest.query.toLowerCase().includes('python') || 
                         searchRequest.query.toLowerCase().includes('data structures') ||
                         searchRequest.query.toLowerCase().includes('algorithm') ||
                         searchRequest.query.toLowerCase().includes('repository') ||
                         searchRequest.query.toLowerCase().includes('code') ||
                         searchRequest.query.toLowerCase().includes('programming') ||
                         searchRequest.query.toLowerCase().includes('github');
      
      // Enhanced keyword detection for AI/ML queries that might have relevant code
      const isAIQuery = searchRequest.query.toLowerCase().includes('mistral') ||
                       searchRequest.query.toLowerCase().includes('llama') ||
                       searchRequest.query.toLowerCase().includes('transformer') ||
                       searchRequest.query.toLowerCase().includes('gpt') ||
                       searchRequest.query.toLowerCase().includes('ai') ||
                       searchRequest.query.toLowerCase().includes('machine learning') ||
                       searchRequest.query.toLowerCase().includes('neural network');
      
      // Query analysis for external search triggers
      
      // Enhanced external search with better error handling and timeouts
      const externalSearchPromises = [];
      
      // GitHub search for code and AI-related queries
      if ((isCodeQuery || isAIQuery) && process.env.GITHUB_TOKEN) {
        console.log('🐙 Searching GitHub...');
        externalSearchPromises.push(
          Promise.race([
            searchGitHubRepos(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3)))
              .then(repos => ({
                type: 'github',
                results: repos.map((repo, index) => 
                  transformGitHubRepoToDocument(repo, index + allDocuments.length, searchRequest.query)
                )
              }))
              .catch(error => {
                console.log('🐙 GitHub search failed:', error.message);
                return { type: 'github', results: [] };
              }),
            new Promise((_, reject) => 
              setTimeout(() => reject(new Error('GitHub search timeout')), 8000)
            )
          ]).catch(() => ({ type: 'github', results: [] }))
        );
      }
      
      // Always include web search for comprehensive coverage
      console.log('🌍 Searching web...');
      externalSearchPromises.push(
        Promise.race([
          searchWeb(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3)))
            .then(webResults => ({
              type: 'web',
              results: webResults.map((result, index) => 
                transformWebResultToDocument(result, index + allDocuments.length, searchRequest.query)
              )
            }))
            .catch(error => {
              console.log('🌍 Web search failed:', error.message);
              return { type: 'web', results: [] };
            }),
          new Promise((_, reject) => 
            setTimeout(() => reject(new Error('Web search timeout')), 5000)
          )
        ]).catch(() => ({ type: 'web', results: [] }))
      );
      
      // Wait for external searches with timeout protection
      if (externalSearchPromises.length > 0) {
        try {
          const externalResults = await Promise.all(externalSearchPromises);
          
          // Flatten and combine results
          const githubResult = externalResults.find((r: any) => r?.type === 'github') as any;
          const webResult = externalResults.find((r: any) => r?.type === 'web') as any;
          const githubResults = githubResult?.results || [];
          const webResults = webResult?.results || [];
          const allExternalResults = [...githubResults, ...webResults];
          
          console.log(`🌐 Found ${allExternalResults.length} external results (GitHub: ${githubResults.length}, Web: ${webResults.length})`);
          
          // Combine local and external results, keeping local results prioritized
          if (allExternalResults.length > 0) {
            allDocuments = [...allDocuments, ...allExternalResults]
              .sort((a, b) => b.relevanceScore - a.relevanceScore)
              .slice(0, searchRequest.limit);
          }
        } catch (externalError: any) {
          console.log('🌐 External search failed:', externalError?.message || externalError);
        }
      }
      
      console.log(`✅ Total results: ${allDocuments.length}`);
      
      const searchTime = (Date.now() - startTime) / 1000;
      const response = {
        results: allDocuments,
        totalCount: allDocuments.length,
        searchTime,
        query: searchRequest.query,
        queryId: Date.now()
      };

      res.json(response);
    } catch (error) {
      if (error instanceof z.ZodError) {
        res.status(400).json({ message: "Invalid search request", errors: error.errors });
      } else {
        console.error('Search error:', error);
        res.status(500).json({ message: "Internal server error" });
      }
    }
  });

  // AI explanation endpoint using Nebius.
  // Body: { title: string, snippet: string }. Responds with { explanation },
  // 400 when either field is missing or not a string, 500 on any upstream failure.
  app.post("/api/explain", async (req, res) => {
    try {
      // Tolerate a missing body so the request is answered with 400 instead of
      // crashing on the destructure.
      const { title, snippet } = req.body ?? {};

      // Both values are interpolated into the prompt, so insist on non-empty strings
      if (typeof title !== 'string' || typeof snippet !== 'string' || !title || !snippet) {
        return res.status(400).json({ message: "Title and snippet are required" });
      }

      const prompt = `You are an expert communicator. Explain this document directly in a clear, conversational way suitable for audio playback. Do not show your thinking process - just provide the final explanation.

Title: ${title}
Content: ${snippet}

Provide a brief, engaging explanation (2-3 sentences) that would be pleasant to listen to. Focus on the key concepts and practical value. Start your response immediately with the explanation.`;

      const response = await nebiusClient.createChatCompletion({
        model: "deepseek-ai/DeepSeek-R1-0528", // Using DeepSeek model via Nebius
        messages: [{ role: "user", content: prompt }],
        max_tokens: 150,
        temperature: 0.7,
      });

      // Fail with a descriptive error (handled by the catch below) when the
      // completion carries no text, rather than an opaque TypeError.
      const rawExplanation = response?.choices?.[0]?.message?.content;
      if (typeof rawExplanation !== 'string') {
        throw new Error('Nebius completion contained no message content');
      }

      // Strip any <think> blocks the model emitted before returning the text
      const explanation = cleanThinkingTags(rawExplanation);
      res.json({ explanation });
    } catch (error) {
      console.error('AI explanation error:', error);
      res.status(500).json({ message: "Failed to generate explanation" });
    }
  });

  // AI-assisted search: checks the query, then hands off to the smart
  // ingestion service's enhanced search in semantic mode.
  // Body: { query, maxResults = 10, useQueryEnhancement = true }.
  app.post("/api/ai-search", async (req, res) => {
    try {
      const { query, maxResults = 10, useQueryEnhancement = true } = req.body;

      // Anything other than a non-empty string is rejected
      if (typeof query !== 'string' || query === '') {
        return res.status(400).json({ message: "Query is required" });
      }

      const searchOptions = {
        maxResults,
        searchType: 'semantic' as const,
        useQueryEnhancement
      };
      const searchOutcome = await smartIngestionService.enhancedSearch(query, searchOptions);

      res.json(searchOutcome);
    } catch (error) {
      console.error('AI search error:', error);
      const detail = error instanceof Error ? error.message : 'Unknown error';
      res.status(500).json({ message: "AI search failed", error: detail });
    }
  });

  // Document analysis endpoint backed by the Nebius client.
  // Body: { content, analysisType = 'summary', useMarkdown = true }.
  app.post("/api/analyze-document", async (req, res) => {
    try {
      const { content, analysisType = 'summary', useMarkdown = true } = req.body;

      // Nothing to analyze without content
      if (!content) {
        return res.status(400).json({ message: "Content is required" });
      }

      const analysisRequest = { content, analysisType, useMarkdown };
      const analysisResult = await nebiusClient.analyzeDocument(analysisRequest);

      res.json(analysisResult);
    } catch (error) {
      console.error('Document analysis error:', error);
      const detail = error instanceof Error ? error.message : 'Unknown error';
      res.status(500).json({ message: "Document analysis failed", error: detail });
    }
  });

  // Research synthesis endpoint: loads the requested documents from storage
  // and asks the smart ingestion service to synthesize them for the query.
  // Body: { query, documentIds: [] }.
  app.post("/api/research-synthesis", async (req, res) => {
    try {
      const { query, documentIds } = req.body;

      // Need a query plus an array of ids (the array may still turn out empty)
      if (!(query && Array.isArray(documentIds))) {
        return res.status(400).json({ message: "Query and document IDs are required" });
      }

      // Look every id up concurrently, then drop the ids storage could not resolve
      const lookups = documentIds.map(id => storage.getDocument(id));
      const fetched = await Promise.all(lookups);
      const validDocuments = fetched.filter(Boolean);

      if (validDocuments.length === 0) {
        return res.status(400).json({ message: "No valid documents found" });
      }

      const synthesis = await smartIngestionService.generateResearchSynthesis(query, validDocuments);

      res.json(synthesis);
    } catch (error) {
      console.error('Research synthesis error:', error);
      const detail = error instanceof Error ? error.message : 'Unknown error';
      res.status(500).json({ message: "Research synthesis failed", error: detail });
    }
  });

  // Query enhancement endpoint backed by the Nebius client.
  // Body: { query, context }. Returns the enhancement object with thinking
  // tags stripped from its `enhancedQuery` and `intent` fields.
  app.post("/api/enhance-query", async (req, res) => {
    try {
      const { query, context } = req.body;

      if (!query) {
        return res.status(400).json({ message: "Query is required" });
      }

      const enhancement = await nebiusClient.enhanceQuery(query, context);

      // The model may leak <think> blocks into its text fields; scrub them in place
      const cleanedQuery = cleanThinkingTags(enhancement.enhancedQuery);
      enhancement.enhancedQuery = cleanedQuery;
      const cleanedIntent = cleanThinkingTags(enhancement.intent);
      enhancement.intent = cleanedIntent;

      res.json(enhancement);
    } catch (error) {
      console.error('Query enhancement error:', error);
      const detail = error instanceof Error ? error.message : 'Unknown error';
      res.status(500).json({ message: "Query enhancement failed", error: detail });
    }
  });

  // Reports the status of a Modal task identified by the :taskId path segment
  app.get("/api/modal-task/:taskId", async (req, res) => {
    try {
      const taskStatus = await modalClient.getTaskStatus(req.params.taskId);
      res.json(taskStatus);
    } catch (error) {
      console.error('Modal task status error:', error);
      const detail = error instanceof Error ? error.message : 'Unknown error';
      res.status(500).json({ message: "Failed to get task status", error: detail });
    }
  });

  // Batch document ingestion using Modal.
  // Body: { documents: [{ content?, filename?, contentType?, metadata? }, ...] }.
  // Responds 400 when `documents` is missing, empty, or contains a non-object
  // entry; otherwise returns whatever the ingestion service reports.
  app.post("/api/batch-ingest", async (req, res) => {
    try {
      // Tolerate a missing body so the request is answered with 400 instead of
      // crashing on the destructure.
      const { documents } = req.body ?? {};

      if (!Array.isArray(documents) || documents.length === 0) {
        return res.status(400).json({ message: "Documents array is required" });
      }

      // A null or primitive entry would blow up on property access below;
      // treat it as a client error rather than a server failure.
      const hasMalformedEntry = documents.some(doc => doc === null || typeof doc !== 'object');
      if (hasMalformedEntry) {
        return res.status(400).json({ message: "Each document must be an object" });
      }

      // Normalize every entry into the upload shape, filling in defaults
      const uploads = documents.map(doc => ({
        file: doc.content || '',
        filename: doc.filename || 'unknown.txt',
        contentType: doc.contentType || 'text/plain',
        metadata: doc.metadata || {}
      }));

      const result = await smartIngestionService.batchIngestDocuments(uploads);
      res.json(result);
    } catch (error) {
      console.error('Batch ingestion error:', error);
      res.status(500).json({ 
        message: "Batch ingestion failed",
        error: error instanceof Error ? error.message : 'Unknown error'
      });
    }
  });

  // Health check endpoint: lazily loads the health checker, runs it, and
  // answers 200 when no service reports 'error', 503 otherwise.
  app.get("/api/health", async (req, res) => {
    try {
      const healthModule = await import('./api-health-check');
      const services = await healthModule.checkAPIHealth();

      // A single service in the 'error' state marks the whole API as degraded
      const hasFailure = services.some(service => service.status === 'error');

      res.status(hasFailure ? 503 : 200).json({
        overall: hasFailure ? 'issues_detected' : 'healthy',
        services,
        timestamp: new Date().toISOString()
      });
    } catch (error) {
      const detail = error instanceof Error ? error.message : 'Unknown error';
      res.status(500).json({
        overall: 'error',
        message: 'Health check failed',
        error: detail
      });
    }
  });

  // Generate embeddings using Nebius.
  // Body: { input, model = 'text-embedding-ada-002' }. Responds with the
  // client's embeddings result, 400 when `input` is missing, 500 on failure.
  app.post("/api/embeddings", async (req, res) => {
    try {
      // Tolerate a missing body so the request is answered with 400 instead of
      // crashing on the destructure.
      const { input, model = 'text-embedding-ada-002' } = req.body ?? {};
      
      if (!input) {
        return res.status(400).json({ message: "Input text is required" });
      }

      // `input` is not guaranteed to be a string (e.g. a client may send an
      // array of strings), so build the log preview without calling string
      // methods on it directly.
      // NOTE(review): confirm which input shapes nebiusClient.createEmbeddings accepts.
      const inputText = typeof input === 'string' ? input : (JSON.stringify(input) ?? '');
      console.log('Generating embeddings for input:', inputText.substring(0, 100) + '...');
      const embeddings = await nebiusClient.createEmbeddings({ input, model });
      console.log('Embeddings generated successfully');
      res.json(embeddings);
    } catch (error) {
      console.error('Embeddings error:', error);
      res.status(500).json({ 
        message: "Embedding generation failed",
        error: error instanceof Error ? error.message : 'Unknown error'
      });
    }
  });

  // Paginated document listing.
  // Query: ?limit (positive integer, default 50) & ?offset (non-negative
  // integer, default 0). Unparseable or out-of-range values fall back to the
  // defaults instead of being forwarded to storage.
  app.get("/api/documents", async (req, res) => {
    try {
      // Explicit radix; NaN fails both comparisons below and selects the default
      const requestedLimit = Number.parseInt(String(req.query.limit), 10);
      const requestedOffset = Number.parseInt(String(req.query.offset), 10);
      const limit = requestedLimit > 0 ? requestedLimit : 50;
      const offset = requestedOffset > 0 ? requestedOffset : 0;

      const documents = await storage.getDocuments(limit, offset);
      res.json(documents);
    } catch (error) {
      // Log the cause like the other handlers do; the client gets a generic message
      console.error('Fetch documents error:', error);
      res.status(500).json({ message: "Failed to fetch documents" });
    }
  });

  // Choose which router serves /api/documents. Uploads are on by default in
  // every environment and can only be switched off with DISABLE_UPLOADS=true
  // when no Hugging Face marker variable is present.

  // First non-empty Hugging Face marker variable, or false when none is set
  const isHuggingFaceSpace = [
    process.env.SPACE_ID,
    process.env.HF_SPACE_ID,
    process.env.HUGGINGFACE_SPACE_ID,
    process.env.HF_TOKEN
  ].find(Boolean) || false;

  // Outside production storage is taken to be writable; in production /tmp
  // must exist. Within this block the value is only reported in the log below.
  const hasWritableStorage = process.env.NODE_ENV !== 'production' || fs.existsSync('/tmp');

  // A Hugging Face marker forces uploads on; otherwise honor DISABLE_UPLOADS
  const isDocumentUploadEnabled = Boolean(isHuggingFaceSpace) || process.env.DISABLE_UPLOADS !== 'true';

  const environmentReport = {
    NODE_ENV: process.env.NODE_ENV,
    DISABLE_UPLOADS: process.env.DISABLE_UPLOADS,
    isHuggingFaceSpace: Boolean(isHuggingFaceSpace),
    hasWritableStorage,
    isDocumentUploadEnabled
  };
  console.log('🔍 Environment check:', environmentReport);

  if (!isDocumentUploadEnabled) {
    console.log('ℹ️  Document uploads disabled - using fallback routes');
    app.use("/api/documents", uploadFallbackRoutes);
  } else {
    console.log('✅ Document uploads enabled - full functionality available');
    app.use("/api/documents", documentRoutes);
  }

  // Wrap the configured Express app in a Node HTTP server and hand it back
  return createServer(app);
}