# Spaces:
# Runtime error
# Runtime error
# File size: 4,921 Bytes
# 9581133 7e775c9 9581133 97c0482 9581133 4257f85 9581133 a9adb77 9581133 4257f85 9581133 4257f85 9581133
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
from html import escape
import os
import shutil
import time
from datetime import datetime, timedelta
import bleach
import requests
from flask import Flask, request, jsonify, render_template
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID, BOOLEAN
from whoosh.qparser import QueryParser
# Flask application object; the route decorators in this module attach to it.
app = Flask(__name__)

# Root directory under which each repository gets its own Whoosh index.
BASE_INDEX_DIR = "discussion_indices"

# Age after which a repository's index is considered stale and is rebuilt.
CACHE_DURATION = timedelta(days=1)

# One indexed document per discussion; every field is stored so search
# results can be rendered straight from the index.
schema = Schema(
    discussion_id=ID(stored=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    author=TEXT(stored=True),
    is_pr=BOOLEAN(stored=True),
    is_open=BOOLEAN(stored=True),
)
def get_repo_index_dir(repo_name):
    """Return the index directory for ``repo_name`` below ``BASE_INDEX_DIR``.

    The repository name is flattened into a single path component: every
    character other than letters, digits, ``.``, ``_`` and ``-`` (which
    includes path separators) is replaced by ``_``.

    Raises:
        ValueError: if the flattened name is empty or made only of dots
            (e.g. ``.`` or ``..``). Joining such a name would resolve to
            ``BASE_INDEX_DIR`` itself or its parent, and the indexing code
            removes the returned directory with ``shutil.rmtree``.
    """
    safe_name = ''.join(
        ch if ch.isalnum() or ch in '._-' else '_' for ch in repo_name
    )
    if not safe_name.strip('.'):
        raise ValueError(f'Invalid repository name: {repo_name!r}')
    return os.path.join(BASE_INDEX_DIR, safe_name)
def get_repo_last_indexed_file(repo_name):
    """Return the path of the timestamp file kept inside the repo's index directory."""
    index_dir = get_repo_index_dir(repo_name)
    return os.path.join(index_dir, 'last_indexed.txt')
def needs_reindex(repo_name):
    """Return True when the index for ``repo_name`` is missing or stale.

    The index is considered fresh only if its timestamp file exists, holds a
    parseable Unix timestamp, and that timestamp is no older than
    ``CACHE_DURATION``. A missing, empty or corrupt timestamp file counts as
    stale, so a half-written file cannot make every later request fail.
    """
    last_indexed_file = get_repo_last_indexed_file(repo_name)

    # Open directly instead of checking existence first: avoids a race with
    # a concurrent rebuild that removes the index directory.
    try:
        with open(last_indexed_file, 'r') as f:
            raw_timestamp = f.read().strip()
    except FileNotFoundError:
        return True

    try:
        last_indexed = datetime.fromtimestamp(float(raw_timestamp))
    except (ValueError, OverflowError, OSError):
        # Unparseable or out-of-range timestamp: rebuild rather than crash.
        return True

    return datetime.now() - last_indexed > CACHE_DURATION
def _fetch_json(url, timeout=30):
    """GET ``url`` and return its decoded JSON body.

    Raises ``requests.HTTPError`` for 4xx/5xx responses and a
    ``requests.RequestException`` subclass on timeout or connection failure.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def _collect_discussion_documents(repo_name):
    """Download the discussions of ``repo_name`` as keyword dicts for the index.

    Returns one dict per entry in the listing endpoint's ``discussions``
    array, with keys matching the fields of the module-level ``schema``.
    """
    api_base = f'https://huggingface.co/api/{repo_name}/discussions'
    documents = []
    for discussion in _fetch_json(api_base)['discussions']:
        details = _fetch_json(f'{api_base}/{discussion["num"]}')

        # Combine all comments into one content string
        content = []
        for comment in details['events']:
            try:
                if comment['type'] == 'comment':
                    authorname = comment['author']['name'] if 'author' in comment else 'deleted'
                    content.append(f'{authorname}: {comment["data"]["latest"]["raw"]}')
            except KeyError:
                # Best effort: report the malformed event and keep going.
                print('Error in comment:')
                print(comment)

        documents.append({
            'discussion_id': str(discussion["num"]),
            'title': discussion["title"],
            'content': f"Title: {discussion['title']}\n\n" + '\n'.join(content),
            'author': discussion["author"]["name"] if "author" in discussion else 'deleted',
            'is_pr': discussion["isPullRequest"],
            'is_open': discussion["status"] == "open",
        })
    return documents


def index_discussions(repo_name):
    """Rebuild the on-disk search index for ``repo_name`` from the Hub API.

    All discussions are downloaded before the existing index directory is
    touched, so a network or HTTP failure leaves the previous index in
    place. On success the index directory is recreated from scratch and the
    current time is written to the repo's timestamp file.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            4xx/5xx response from the API.
        KeyError: if a response lacks an expected top-level field.
    """
    index_dir = get_repo_index_dir(repo_name)

    # Fetch first: nothing on disk is deleted unless the download succeeded.
    documents = _collect_discussion_documents(repo_name)

    # Clear and recreate index directory
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    os.makedirs(index_dir, exist_ok=True)

    # Create index
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    try:
        for document in documents:
            writer.add_document(**document)
    except Exception:
        # Release the writer instead of leaving it open on failure.
        writer.cancel()
        raise
    writer.commit()

    # Update last indexed timestamp
    with open(get_repo_last_indexed_file(repo_name), 'w') as f:
        f.write(str(time.time()))
@app.route('/')
def index():
    """Render the search page for the repository named in the query string.

    Reads the ``repo`` and ``query`` URL parameters. Without ``repo`` the
    ``no_repo.html`` template is rendered. Otherwise a name lacking a
    ``spaces/``, ``datasets/`` or ``models/`` prefix gets ``models/``
    prepended, and the name is cut down to its first three ``/``-separated
    segments before ``index.html`` is rendered.
    """
    repo_name = request.args.get('repo')
    query = request.args.get('query')

    if not repo_name:
        return render_template('no_repo.html')

    known_prefixes = ('spaces/', 'datasets/', 'models/')
    if not repo_name.startswith(known_prefixes):
        repo_name = f'models/{repo_name}'

    # Keep only "<type>/<owner>/<name>", dropping any trailing path segments.
    leading_segments = repo_name.split('/')[:3]
    repo_name = '/'.join(leading_segments)

    return render_template('index.html', repo_name=repo_name, query=query)
@app.route('/search', methods=['POST'])
def search():
    """Search a repository's discussions and return the matches as JSON.

    Expects a JSON object body with string fields ``repo`` and ``query``.
    The repository's index is rebuilt first when ``needs_reindex`` reports
    it missing or stale. Responds with ``{'results': [...]}`` on success,
    or ``{'error': ...}`` with status 400 when the body is not a JSON
    object, a field is missing or not a string, or the repository name
    contains an empty, ``.`` or ``..`` path segment.
    """
    # silent=True yields None instead of an HTTP error page for a bad body,
    # so every client error below is reported in the same JSON shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    query = data.get('query')
    repo_name = data.get('repo')
    if not repo_name:
        return jsonify({'error': 'No repository provided'}), 400
    if not query:
        return jsonify({'error': 'No query provided'}), 400
    if not isinstance(repo_name, str) or not isinstance(query, str):
        return jsonify({'error': 'Repository and query must be strings'}), 400

    # The name is used both as a filesystem path component and inside the
    # upstream API URL; refuse segments that would navigate either of them.
    if any(part in ('', '.', '..') for part in repo_name.split('/')):
        return jsonify({'error': 'Invalid repository name'}), 400

    # Check if we need to reindex
    if needs_reindex(repo_name):
        index_discussions(repo_name)

    # Public discussion URLs omit the "models/" prefix.
    if repo_name.startswith('models/'):
        url_repo_name = repo_name[len('models/'):]
    else:
        url_repo_name = repo_name

    # Search the index
    ix = open_dir(get_repo_index_dir(repo_name))
    with ix.searcher() as searcher:
        query_parser = QueryParser("content", ix.schema)
        q = query_parser.parse(query)
        results = searcher.search(q)

        # Format results inside the `with` block, while the searcher that
        # produced the hits is still open.
        formatted_results = [{
            'discussion_id': escape(result['discussion_id']),
            'url': f'https://huggingface.co/{url_repo_name}/discussions/{result["discussion_id"]}',
            'title': escape(result['title']),
            'author': escape(result['author']),
            # Highlight markup is cleaned so that only <b> tags remain.
            'excerpt': bleach.clean(result.highlights("content"), tags=['b'], strip=True),
            'is_pr': result['is_pr'],
            'is_open': result['is_open']
        } for result in results]

    return jsonify({'results': formatted_results})
if __name__ == '__main__':
    # When run as a script, serve on all interfaces on port 7860.
    app.run(host='0.0.0.0', port=7860)