KnowledgeBridge / server /sqlite-storage.ts
fazeel007's picture
Fix Hugging Face Spaces deployment: Handle file permissions and production environment
ccf1a85
import Database from 'better-sqlite3';
import path from 'path';
import fs from 'fs';
import {
type Document,
type InsertDocument,
type SearchQuery,
type InsertSearchQuery,
type SearchResult,
type InsertSearchResult,
type Citation,
type InsertCitation,
type SearchRequest,
type SearchResponse,
type DocumentWithContext
} from "@shared/schema";
import { IStorage } from './storage';
export class SQLiteStorage implements IStorage {
private db: Database.Database;
constructor(dbPath?: string) {
// Use /tmp for database in production environments (like Hugging Face Spaces)
const defaultPath = process.env.NODE_ENV === 'production'
? '/tmp/knowledgebridge.db'
: './data/knowledgebridge.db';
const finalPath = dbPath || defaultPath;
// Ensure data directory exists with error handling
const dir = path.dirname(finalPath);
try {
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir, { recursive: true });
}
} catch (error) {
console.warn(`Failed to create database directory at ${dir}:`, error);
}
this.db = new Database(finalPath);
this.initializeTables();
}
private initializeTables() {
// Enable foreign keys
this.db.pragma('foreign_keys = ON');
// Create documents table
this.db.exec(`
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
content TEXT NOT NULL,
source TEXT NOT NULL,
source_type TEXT NOT NULL,
url TEXT,
metadata TEXT, -- JSON string
embedding TEXT, -- JSON string
file_path TEXT,
file_name TEXT,
file_size INTEGER,
mime_type TEXT,
processing_status TEXT NOT NULL DEFAULT 'pending',
modal_task_id TEXT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
processed_at DATETIME
)
`);
// Create search_queries table
this.db.exec(`
CREATE TABLE IF NOT EXISTS search_queries (
id INTEGER PRIMARY KEY AUTOINCREMENT,
query TEXT NOT NULL,
search_type TEXT NOT NULL DEFAULT 'semantic',
filters TEXT, -- JSON string
results_count INTEGER DEFAULT 0,
search_time REAL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
`);
// Create search_results table
this.db.exec(`
CREATE TABLE IF NOT EXISTS search_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
query_id INTEGER NOT NULL,
document_id INTEGER NOT NULL,
relevance_score REAL NOT NULL,
snippet TEXT NOT NULL,
rank INTEGER NOT NULL,
FOREIGN KEY (query_id) REFERENCES search_queries(id),
FOREIGN KEY (document_id) REFERENCES documents(id)
)
`);
// Create citations table
this.db.exec(`
CREATE TABLE IF NOT EXISTS citations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
document_id INTEGER NOT NULL,
citation_text TEXT NOT NULL,
page_number INTEGER,
section TEXT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (document_id) REFERENCES documents(id)
)
`);
// Create indexes for better performance
this.db.exec(`
CREATE INDEX IF NOT EXISTS idx_documents_source_type ON documents(source_type);
CREATE INDEX IF NOT EXISTS idx_documents_processing_status ON documents(processing_status);
CREATE INDEX IF NOT EXISTS idx_search_results_query_id ON search_results(query_id);
CREATE INDEX IF NOT EXISTS idx_search_results_document_id ON search_results(document_id);
CREATE INDEX IF NOT EXISTS idx_citations_document_id ON citations(document_id);
`);
}
async getDocument(id: number): Promise<Document | undefined> {
const stmt = this.db.prepare('SELECT * FROM documents WHERE id = ?');
const row = stmt.get(id) as any;
return row ? this.mapDocumentRow(row) : undefined;
}
async getDocuments(limit = 50, offset = 0): Promise<Document[]> {
const stmt = this.db.prepare('SELECT * FROM documents ORDER BY created_at DESC LIMIT ? OFFSET ?');
const rows = stmt.all(limit, offset) as any[];
return rows.map(row => this.mapDocumentRow(row));
}
async createDocument(insertDocument: InsertDocument): Promise<Document> {
const stmt = this.db.prepare(`
INSERT INTO documents (
title, content, source, source_type, url, metadata, embedding,
file_path, file_name, file_size, mime_type, processing_status, modal_task_id
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
const result = stmt.run(
insertDocument.title,
insertDocument.content,
insertDocument.source,
insertDocument.sourceType,
insertDocument.url || null,
insertDocument.metadata ? JSON.stringify(insertDocument.metadata) : null,
insertDocument.embedding || null,
(insertDocument as any).filePath || null,
(insertDocument as any).fileName || null,
(insertDocument as any).fileSize || null,
(insertDocument as any).mimeType || null,
(insertDocument as any).processingStatus || 'pending',
(insertDocument as any).modalTaskId || null
);
const created = await this.getDocument(result.lastInsertRowid as number);
if (!created) throw new Error('Failed to create document');
return created;
}
async updateDocument(id: number, updateData: Partial<InsertDocument & { processingStatus?: string; modalTaskId?: string; processedAt?: Date }>): Promise<Document | undefined> {
const existing = await this.getDocument(id);
if (!existing) return undefined;
const fields: string[] = [];
const values: any[] = [];
Object.entries(updateData).forEach(([key, value]) => {
if (value !== undefined) {
switch (key) {
case 'sourceType':
fields.push('source_type = ?');
break;
case 'processingStatus':
fields.push('processing_status = ?');
break;
case 'modalTaskId':
fields.push('modal_task_id = ?');
break;
case 'filePath':
fields.push('file_path = ?');
break;
case 'fileName':
fields.push('file_name = ?');
break;
case 'fileSize':
fields.push('file_size = ?');
break;
case 'mimeType':
fields.push('mime_type = ?');
break;
case 'processedAt':
fields.push('processed_at = ?');
value = value instanceof Date ? value.toISOString() : value;
break;
case 'metadata':
fields.push('metadata = ?');
value = value ? JSON.stringify(value) : null;
break;
default:
fields.push(`${key} = ?`);
}
values.push(value);
}
});
if (fields.length === 0) return existing;
values.push(id);
const stmt = this.db.prepare(`UPDATE documents SET ${fields.join(', ')} WHERE id = ?`);
stmt.run(...values);
return await this.getDocument(id);
}
async deleteDocument(id: number): Promise<boolean> {
const stmt = this.db.prepare('DELETE FROM documents WHERE id = ?');
const result = stmt.run(id);
return result.changes > 0;
}
async searchDocuments(request: SearchRequest): Promise<SearchResponse> {
const startTime = Date.now();
let sql = `
SELECT * FROM documents
WHERE (title LIKE ? OR content LIKE ?)
`;
const params: any[] = [`%${request.query}%`, `%${request.query}%`];
// Add source type filter if specified
if (request.filters?.sourceTypes?.length) {
const placeholders = request.filters.sourceTypes.map(() => '?').join(',');
sql += ` AND source_type IN (${placeholders})`;
params.push(...request.filters.sourceTypes);
}
sql += ` ORDER BY
CASE
WHEN title LIKE ? THEN 1
WHEN content LIKE ? THEN 2
ELSE 3
END,
created_at DESC
LIMIT ? OFFSET ?
`;
params.push(`%${request.query}%`, `%${request.query}%`, request.limit, request.offset);
const stmt = this.db.prepare(sql);
const rows = stmt.all(...params) as any[];
const results = rows.map((row, index) => {
const doc = this.mapDocumentRow(row);
return {
...doc,
relevanceScore: this.calculateRelevanceScore(doc, request.query),
snippet: this.extractSnippet(doc.content, request.query),
rank: index + 1
};
});
const searchTime = (Date.now() - startTime) / 1000;
// Save search query
const searchQuery = await this.createSearchQuery({
query: request.query,
searchType: request.searchType,
filters: request.filters,
resultsCount: results.length,
searchTime
});
// Save search results
for (const doc of results) {
await this.createSearchResult({
queryId: searchQuery.id,
documentId: doc.id,
relevanceScore: doc.relevanceScore,
snippet: doc.snippet,
rank: doc.rank
});
}
return {
results,
totalCount: results.length,
searchTime,
query: request.query,
queryId: searchQuery.id
};
}
private calculateRelevanceScore(doc: Document, query: string): number {
const queryLower = query.toLowerCase();
const titleLower = doc.title.toLowerCase();
const contentLower = doc.content.toLowerCase();
let score = 0;
// Exact title match gets highest score
if (titleLower === queryLower) score += 1.0;
else if (titleLower.includes(queryLower)) score += 0.8;
// Content matches
if (contentLower.includes(queryLower)) score += 0.3;
// Word-by-word scoring
const queryWords = queryLower.split(' ');
queryWords.forEach(word => {
if (titleLower.includes(word)) score += 0.2;
if (contentLower.includes(word)) score += 0.1;
});
return Math.min(score, 1.0);
}
private extractSnippet(content: string, query: string, maxLength = 200): string {
const queryLower = query.toLowerCase();
const contentLower = content.toLowerCase();
const index = contentLower.indexOf(queryLower);
if (index === -1) {
return content.substring(0, maxLength) + (content.length > maxLength ? '...' : '');
}
const start = Math.max(0, index - 50);
const end = Math.min(content.length, index + queryLower.length + 150);
let snippet = content.substring(start, end);
if (start > 0) snippet = '...' + snippet;
if (end < content.length) snippet = snippet + '...';
return snippet;
}
async getDocumentsBySourceType(sourceType: string): Promise<Document[]> {
const stmt = this.db.prepare('SELECT * FROM documents WHERE source_type = ? ORDER BY created_at DESC');
const rows = stmt.all(sourceType) as any[];
return rows.map(row => this.mapDocumentRow(row));
}
async getDocumentsByProcessingStatus(status: string): Promise<Document[]> {
const stmt = this.db.prepare('SELECT * FROM documents WHERE processing_status = ? ORDER BY created_at DESC');
const rows = stmt.all(status) as any[];
return rows.map(row => this.mapDocumentRow(row));
}
async createSearchQuery(insertQuery: InsertSearchQuery): Promise<SearchQuery> {
const stmt = this.db.prepare(`
INSERT INTO search_queries (query, search_type, filters, results_count, search_time)
VALUES (?, ?, ?, ?, ?)
`);
const result = stmt.run(
insertQuery.query,
insertQuery.searchType || 'semantic',
insertQuery.filters ? JSON.stringify(insertQuery.filters) : null,
insertQuery.resultsCount || null,
insertQuery.searchTime || null
);
const created = this.db.prepare('SELECT * FROM search_queries WHERE id = ?').get(result.lastInsertRowid) as any;
return this.mapSearchQueryRow(created);
}
async getSearchQueries(limit = 50): Promise<SearchQuery[]> {
const stmt = this.db.prepare('SELECT * FROM search_queries ORDER BY created_at DESC LIMIT ?');
const rows = stmt.all(limit) as any[];
return rows.map(row => this.mapSearchQueryRow(row));
}
async createSearchResult(insertResult: InsertSearchResult): Promise<SearchResult> {
const stmt = this.db.prepare(`
INSERT INTO search_results (query_id, document_id, relevance_score, snippet, rank)
VALUES (?, ?, ?, ?, ?)
`);
const result = stmt.run(
insertResult.queryId,
insertResult.documentId,
insertResult.relevanceScore,
insertResult.snippet,
insertResult.rank
);
const created = this.db.prepare('SELECT * FROM search_results WHERE id = ?').get(result.lastInsertRowid) as any;
return this.mapSearchResultRow(created);
}
async getSearchResults(queryId: number): Promise<SearchResult[]> {
const stmt = this.db.prepare('SELECT * FROM search_results WHERE query_id = ? ORDER BY rank');
const rows = stmt.all(queryId) as any[];
return rows.map(row => this.mapSearchResultRow(row));
}
async createCitation(insertCitation: InsertCitation): Promise<Citation> {
const stmt = this.db.prepare(`
INSERT INTO citations (document_id, citation_text, page_number, section)
VALUES (?, ?, ?, ?)
`);
const result = stmt.run(
insertCitation.documentId,
insertCitation.citationText,
insertCitation.pageNumber || null,
insertCitation.section || null
);
const created = this.db.prepare('SELECT * FROM citations WHERE id = ?').get(result.lastInsertRowid) as any;
return this.mapCitationRow(created);
}
async getCitationsByDocument(documentId: number): Promise<Citation[]> {
const stmt = this.db.prepare('SELECT * FROM citations WHERE document_id = ? ORDER BY created_at DESC');
const rows = stmt.all(documentId) as any[];
return rows.map(row => this.mapCitationRow(row));
}
async deleteCitation(id: number): Promise<boolean> {
const stmt = this.db.prepare('DELETE FROM citations WHERE id = ?');
const result = stmt.run(id);
return result.changes > 0;
}
private mapDocumentRow(row: any): Document {
return {
id: row.id,
title: row.title,
content: row.content,
source: row.source,
sourceType: row.source_type,
url: row.url,
metadata: row.metadata ? JSON.parse(row.metadata) : null,
embedding: row.embedding,
createdAt: new Date(row.created_at),
filePath: row.file_path,
fileName: row.file_name,
fileSize: row.file_size,
mimeType: row.mime_type,
processingStatus: row.processing_status,
modalTaskId: row.modal_task_id,
processedAt: row.processed_at ? new Date(row.processed_at) : null,
} as Document;
}
private mapSearchQueryRow(row: any): SearchQuery {
return {
id: row.id,
query: row.query,
searchType: row.search_type,
filters: row.filters ? JSON.parse(row.filters) : null,
resultsCount: row.results_count,
searchTime: row.search_time,
createdAt: new Date(row.created_at)
};
}
private mapSearchResultRow(row: any): SearchResult {
return {
id: row.id,
queryId: row.query_id,
documentId: row.document_id,
relevanceScore: row.relevance_score,
snippet: row.snippet,
rank: row.rank
};
}
private mapCitationRow(row: any): Citation {
return {
id: row.id,
documentId: row.document_id,
citationText: row.citation_text,
pageNumber: row.page_number,
section: row.section,
createdAt: new Date(row.created_at)
};
}
close() {
this.db.close();
}
}