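"""DeepFocus-X3 dataset generator.

Gradio app that pulls words from the NLTK corpus, generates a one-sentence
definition for each with a Hugging Face text2text-generation model, saves the
results to a CSV file, and pushes the CSV to the katsukiai/DeepFocus-X3
dataset repository.
"""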
import os
import logging
import csv
import shutil
import nltk
import pandas as pd
from tqdm import tqdm
import gradio as gr
from datasets import Dataset
from transformers import pipeline
from huggingface_hub import HfApi
# ---------------------- Logging Setup ----------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)
# ---------------------- NLTK Setup ----------------------
def download_nltk():
    # Download only the corpora this app uses: "punkt" for tokenization,
    # "words" for the word list.
    nltk.download("punkt")
    nltk.download("words")
    logging.info("NLTK resources downloaded.")

download_nltk()
# ---------------------- Data Preparation ----------------------
def get_all_words():
    from nltk.corpus import words as nltk_words
    all_words = nltk_words.words()
    logging.info(f"Got {len(all_words)} words from NLTK.")
    return all_words
def generate_meaning(word, generator):
    prompt = f"Define the word '{word}' in one concise sentence."
    try:
        result = generator(prompt, max_length=50)[0]["generated_text"]
        return result.strip()
    except Exception as e:
        logging.error(f"Error generating meaning for '{word}': {e}")
        return ""
def process_words(model_name, limit=None):
    logging.info("Initializing Hugging Face text2text-generation pipeline...")
    generator = pipeline("text2text-generation", model=model_name, device=-1)
    words_list = get_all_words()
    if limit:
        words_list = words_list[:limit]
    data = []
    for word in tqdm(words_list, desc="Processing words"):
        tokens = nltk.word_tokenize(word)
        meaning = generate_meaning(word, generator)
        data.append({
            "tokenizer": tokens,
            "words": word,
            "meaning": meaning
        })
    logging.info("Finished processing words.")
    return data
def save_to_csv(data, filename="output.csv"):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    logging.info(f"Saved CSV to {filename}.")
    return filename
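# Optional sketch (an assumption, not wired into the app): the imported `Dataset`
# class from `datasets` can load the same CSV if a Hub-ready dataset object is
# needed downstream, e.g.:
#   ds = Dataset.from_csv("output.csv")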
# ---------------------- Push to Hugging Face ----------------------
def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
    # Clone the dataset repo into its own directory (skipped if it already
    # exists), copy the CSV in, then commit and push via git.
    repo_local_dir = repo_id.split("/")[-1]
    if not os.path.exists(repo_local_dir):
        os.system(f"git clone https://huggingface.co/datasets/{repo_id} {repo_local_dir}")
        logging.info("Repository cloned locally.")
    shutil.copy(csv_file, os.path.join(repo_local_dir, os.path.basename(csv_file)))
    current_dir = os.getcwd()
    os.chdir(repo_local_dir)
    os.system("git add .")
    os.system('git commit -m "Update dataset"')
    os.system("git push")
    os.chdir(current_dir)
    logging.info("Pushed dataset to Hugging Face repository.")
def generate_all(model_name, word_limit):
    # An empty or non-numeric limit means "process all words".
    try:
        word_limit = int(word_limit)
    except Exception:
        word_limit = None
    data = process_words(model_name, limit=word_limit)
    csv_file = save_to_csv(data)
    push_dataset(csv_file)
    return csv_file
# ---------------------- Gradio Interface Functions ----------------------
def run_generate(model_name, word_limit):
    output_csv = generate_all(model_name, word_limit)
    return f"Generated and pushed CSV: {output_csv}"
def about_tab_content():
    about_text = (
        "## DeepFocus-X3 Dataset Generator\n\n"
        "This tool downloads all available words from the NLTK corpus, "
        "generates a concise meaning for each using a Hugging Face text-to-text generation model, "
        "writes the data to a CSV file, and pushes the CSV to the "
        "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository."
    )
    return about_text
def settings_tab_content():
    settings_text = (
        "**Current Settings**\n\n"
        "- Model: `google/flan-t5-xl`\n"
        "- Word Limit: 50 (leave empty to process all words)\n"
        "\nYou can update these settings in the Generate all tab."
    )
    return settings_text
# ---------------------- Gradio App ----------------------
with gr.Blocks() as demo:
    gr.Markdown("## DeepFocus-X3 Dataset Generator")
    with gr.Tabs():
        # About Tab
        with gr.Tab("About"):
            gr.Markdown(about_tab_content())
        # Generate All Tab
        with gr.Tab("Generate all"):
            model_name_input = gr.Textbox(value="google/flan-t5-xl", label="Hugging Face Model Name for Meanings")
            word_limit_input = gr.Textbox(value="50", label="Word Limit (Leave empty for all)")
            generate_button = gr.Button("Generate and Push Dataset")
            generate_output = gr.Textbox(label="Output")
            generate_button.click(run_generate, inputs=[model_name_input, word_limit_input], outputs=generate_output)
        # Settings Tab
        with gr.Tab("Settings"):
            gr.Markdown(settings_tab_content())

demo.launch()