from html import escape
import os
import shutil
import time
from datetime import datetime, timedelta
import bleach
import requests
from flask import Flask, request, jsonify, render_template
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID, BOOLEAN
from whoosh.qparser import QueryParser

app = Flask(__name__)

# Configure index directory and schema
BASE_INDEX_DIR = "discussion_indices"
CACHE_DURATION = timedelta(hours=24)

schema = Schema(
    discussion_id=ID(stored=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    author=TEXT(stored=True),
    is_pr=BOOLEAN(stored=True),
    is_open=BOOLEAN(stored=True)
)

def get_repo_index_dir(repo_name):
    # Convert repo name to safe directory name
    safe_name = repo_name.replace('/', '_')
    return os.path.join(BASE_INDEX_DIR, safe_name)

def get_repo_last_indexed_file(repo_name):
    return os.path.join(get_repo_index_dir(repo_name), 'last_indexed.txt')

def needs_reindex(repo_name):
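    # Reindex when no timestamp file exists or the cached index is older than CACHE_DURATION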
    last_indexed_file = get_repo_last_indexed_file(repo_name)
    if not os.path.exists(last_indexed_file):
        return True
    
    with open(last_indexed_file, 'r') as f:
        last_indexed = datetime.fromtimestamp(float(f.read().strip()))
    
    return datetime.now() - last_indexed > CACHE_DURATION

def index_discussions(repo_name):
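    # Rebuild the Whoosh index for this repo from the Hugging Face discussions API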
    index_dir = get_repo_index_dir(repo_name)
    
    # Clear and recreate index directory
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    os.makedirs(index_dir, exist_ok=True)
    
    # Create index
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    
    # Fetch and index discussions
    discussions = requests.get(f'https://huggingface.co/api/{repo_name}/discussions').json()
    
    for discussion in discussions['discussions']:
        comments = requests.get(
            f'https://huggingface.co/api/{repo_name}/discussions/{discussion["num"]}'
        ).json()
        
        # Combine all comments into one content string
        content = []
        for comment in comments['events']:
            try:
                if comment['type'] == 'comment':
                    authorname = comment['author']['name'] if 'author' in comment else 'deleted'
                    content.append(f'{authorname}: {comment["data"]["latest"]["raw"]}')
            except KeyError:
                print('Error in comment:')
                print(comment)
        writer.add_document(
            discussion_id=str(discussion["num"]),
            title=discussion["title"],
            content=f"Title: {discussion['title']}\n\n" + '\n'.join(content),
            author=discussion["author"]["name"] if "author" in discussion else 'deleted',
            is_pr=discussion["isPullRequest"],
            is_open=discussion["status"] == "open"
        )
    
    writer.commit()
    
    # Update last indexed timestamp
    with open(get_repo_last_indexed_file(repo_name), 'w') as f:
        f.write(str(time.time()))

@app.route('/')
def index():
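    # Render the search UI; normalize the repo name so the template and /search agree on it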
    repo_name = request.args.get('repo')
    query = request.args.get('query')
    if not repo_name:
        return render_template('no_repo.html')
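    # Bare repo IDs default to the models namespace; keep at most '<kind>/<org>/<name>'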
    if not repo_name.startswith(('spaces/', 'datasets/', 'models/')):
        repo_name = f'models/{repo_name}'
    repo_name = '/'.join(repo_name.split('/')[:3])
    return render_template('index.html', repo_name=repo_name, query=query)

@app.route('/search', methods=['POST'])
def search():
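    # Reindex on a cold or stale cache, then run a full-text query over the repo's discussions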
    data = request.json
    query = data.get('query')
    repo_name = data.get('repo')
    if not repo_name:
        return jsonify({'error': 'No repository provided'}), 400

    if not query:
        return jsonify({'error': 'No query provided'}), 400
        
    # Check if we need to reindex
    if needs_reindex(repo_name):
        index_discussions(repo_name)
    
    # Search the index
    ix = open_dir(get_repo_index_dir(repo_name))
    with ix.searcher() as searcher:
        query_parser = QueryParser("content", ix.schema)
        q = query_parser.parse(query)
        results = searcher.search(q)
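        # huggingface.co model URLs omit the 'models/' prefix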
        if repo_name.startswith('models/'):
            url_repo_name = repo_name[7:]
        else:
            url_repo_name = repo_name
        # Format results
        formatted_results = [{
            'discussion_id': escape(result['discussion_id']),
            'url': f'https://huggingface.co/{url_repo_name}/discussions/{result["discussion_id"]}',
            'title': escape(result['title']), 
            'author': escape(result['author']),
            'excerpt': bleach.clean(result.highlights("content"), tags=['b'], strip=True),
            'is_pr': result['is_pr'],
            'is_open': result['is_open']
        } for result in results]
        
        return jsonify({'results': formatted_results})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)