Spaces:
Build error
Build error
streamlit init
Browse files- app.py +5 -7
- arxiv_public_data/__pycache__/__init__.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/config.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/fixunicode.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/fulltext.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/internal_citations.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/pdfstamp.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/regex_arxiv.cpython-310.pyc +0 -0
- arxiv_public_data/config.py +1 -1
- pyproject.toml +6 -0
- src/Surveyor.py +18 -12
- src/__pycache__/Surveyor.cpython-310.pyc +0 -0
- src/__pycache__/defaults.cpython-310.pyc +0 -0
- src/defaults.py +19 -1
app.py
CHANGED
|
@@ -27,7 +27,7 @@ def run_survey(surveyor, research_keywords, max_search, num_papers):
|
|
| 27 |
st.write(line)
|
| 28 |
|
| 29 |
|
| 30 |
-
def survey_space():
|
| 31 |
|
| 32 |
st.title('Automated Survey generation from research keywords - Auto-Research V0.1')
|
| 33 |
|
|
@@ -41,12 +41,10 @@ def survey_space():
|
|
| 41 |
|
| 42 |
if submit:
|
| 43 |
st.write("hello")
|
| 44 |
-
|
| 45 |
-
surveyor_obj = Surveyor()
|
| 46 |
-
run_survey(surveyor_obj, research_keywords, max_search, num_papers)
|
| 47 |
|
| 48 |
|
| 49 |
if __name__ == '__main__':
|
| 50 |
-
global
|
| 51 |
-
surveyor_obj =
|
| 52 |
-
survey_space()
|
|
|
|
| 27 |
st.write(line)
|
| 28 |
|
| 29 |
|
| 30 |
+
def survey_space(surveyor):
|
| 31 |
|
| 32 |
st.title('Automated Survey generation from research keywords - Auto-Research V0.1')
|
| 33 |
|
|
|
|
| 41 |
|
| 42 |
if submit:
|
| 43 |
st.write("hello")
|
| 44 |
+
run_survey(surveyor, research_keywords, max_search, num_papers)
|
|
|
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
if __name__ == '__main__':
|
| 48 |
+
global surveyor
|
| 49 |
+
surveyor_obj = Surveyor()
|
| 50 |
+
survey_space(surveyor_obj)
|
arxiv_public_data/__pycache__/__init__.cpython-310.pyc
DELETED
|
Binary file (148 Bytes)
|
|
|
arxiv_public_data/__pycache__/config.cpython-310.pyc
DELETED
|
Binary file (1.44 kB)
|
|
|
arxiv_public_data/__pycache__/fixunicode.cpython-310.pyc
DELETED
|
Binary file (2.46 kB)
|
|
|
arxiv_public_data/__pycache__/fulltext.cpython-310.pyc
DELETED
|
Binary file (8.32 kB)
|
|
|
arxiv_public_data/__pycache__/internal_citations.cpython-310.pyc
DELETED
|
Binary file (4.27 kB)
|
|
|
arxiv_public_data/__pycache__/pdfstamp.cpython-310.pyc
DELETED
|
Binary file (1.73 kB)
|
|
|
arxiv_public_data/__pycache__/regex_arxiv.cpython-310.pyc
DELETED
|
Binary file (4.4 kB)
|
|
|
arxiv_public_data/config.py
CHANGED
|
@@ -9,7 +9,7 @@ logging.basicConfig(
|
|
| 9 |
baselog = logging.getLogger('arxivdata')
|
| 10 |
logger = baselog.getChild('config')
|
| 11 |
|
| 12 |
-
DEFAULT_PATH = os.path.join(os.path.abspath('
|
| 13 |
JSONFILE = './config.json'
|
| 14 |
KEY = 'ARXIV_DATA'
|
| 15 |
|
|
|
|
| 9 |
baselog = logging.getLogger('arxivdata')
|
| 10 |
logger = baselog.getChild('config')
|
| 11 |
|
| 12 |
+
DEFAULT_PATH = os.path.join(os.path.abspath('../'), 'arxiv-data')
|
| 13 |
JSONFILE = './config.json'
|
| 14 |
KEY = 'ARXIV_DATA'
|
| 15 |
|
pyproject.toml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = [
|
| 3 |
+
"setuptools>=42",
|
| 4 |
+
"wheel"
|
| 5 |
+
]
|
| 6 |
+
build-backend = "setuptools.build_meta"
|
src/Surveyor.py
CHANGED
|
@@ -16,7 +16,7 @@ except:
|
|
| 16 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, AutoModel, LEDTokenizer, \
|
| 17 |
LEDForConditionalGeneration
|
| 18 |
|
| 19 |
-
from src.defaults import
|
| 20 |
|
| 21 |
|
| 22 |
class Surveyor:
|
|
@@ -70,18 +70,20 @@ class Surveyor:
|
|
| 70 |
- num_papers: int maximum number of papers to download and analyse - defaults to 25
|
| 71 |
|
| 72 |
'''
|
| 73 |
-
self.torch_device = '
|
| 74 |
print("\nTorch_device: " + self.torch_device)
|
| 75 |
-
if
|
| 76 |
-
print("\nloading
|
|
|
|
| 77 |
spacy.require_gpu()
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
if not kw_model_name:
|
| 80 |
kw_model_name = DEFAULTS["kw_model_name"]
|
| 81 |
-
if not high_gpu:
|
| 82 |
-
self.high_gpu = DEFAULTS["high_gpu"]
|
| 83 |
-
else:
|
| 84 |
-
self.high_gpu = high_gpu
|
| 85 |
self.num_papers = DEFAULTS['num_papers']
|
| 86 |
self.max_search = DEFAULTS['max_search']
|
| 87 |
if not models_dir:
|
|
@@ -110,8 +112,8 @@ class Surveyor:
|
|
| 110 |
if not no_save_models:
|
| 111 |
self.clean_dirs([models_dir])
|
| 112 |
|
| 113 |
-
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
| 114 |
-
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name).to(self.torch_device)
|
| 115 |
self.title_model.eval()
|
| 116 |
if not no_save_models:
|
| 117 |
self.title_model.save_pretrained(models_dir + "/title_model")
|
|
@@ -142,7 +144,7 @@ class Surveyor:
|
|
| 142 |
self.embedder.save(models_dir + "/embedder")
|
| 143 |
else:
|
| 144 |
print("\nInitializing from previously saved models at" + models_dir)
|
| 145 |
-
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
| 146 |
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
|
| 147 |
self.title_model.eval()
|
| 148 |
|
|
@@ -615,7 +617,11 @@ class Surveyor:
|
|
| 615 |
paper_body = ""
|
| 616 |
for k, v in research_sections.items():
|
| 617 |
paper_body += v
|
| 618 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
|
| 620 |
def build_corpus_sectionwise(self, papers):
|
| 621 |
known = ['abstract', 'introduction', 'conclusion']
|
|
|
|
| 16 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, AutoModel, LEDTokenizer, \
|
| 17 |
LEDForConditionalGeneration
|
| 18 |
|
| 19 |
+
from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU
|
| 20 |
|
| 21 |
|
| 22 |
class Surveyor:
|
|
|
|
| 70 |
- num_papers: int maximum number of papers to download and analyse - defaults to 25
|
| 71 |
|
| 72 |
'''
|
| 73 |
+
self.torch_device = 'cpu'
|
| 74 |
print("\nTorch_device: " + self.torch_device)
|
| 75 |
+
if torch.cuda.is_available():
|
| 76 |
+
print("\nloading defaults for gpu")
|
| 77 |
+
self.torch_device = 'cuda'
|
| 78 |
spacy.require_gpu()
|
| 79 |
|
| 80 |
+
self.high_gpu = high_gpu
|
| 81 |
+
DEFAULTS = DEFAULTS_CPU_COMPAT
|
| 82 |
+
if self.high_gpu:
|
| 83 |
+
DEFAULTS = DEFAULTS_HIGH_GPU
|
| 84 |
+
|
| 85 |
if not kw_model_name:
|
| 86 |
kw_model_name = DEFAULTS["kw_model_name"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
self.num_papers = DEFAULTS['num_papers']
|
| 88 |
self.max_search = DEFAULTS['max_search']
|
| 89 |
if not models_dir:
|
|
|
|
| 112 |
if not no_save_models:
|
| 113 |
self.clean_dirs([models_dir])
|
| 114 |
|
| 115 |
+
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name, trust_remote_code=True)
|
| 116 |
+
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name, trust_remote_code=True).to(self.torch_device)
|
| 117 |
self.title_model.eval()
|
| 118 |
if not no_save_models:
|
| 119 |
self.title_model.save_pretrained(models_dir + "/title_model")
|
|
|
|
| 144 |
self.embedder.save(models_dir + "/embedder")
|
| 145 |
else:
|
| 146 |
print("\nInitializing from previously saved models at" + models_dir)
|
| 147 |
+
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name).to(self.torch_device)
|
| 148 |
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
|
| 149 |
self.title_model.eval()
|
| 150 |
|
|
|
|
| 617 |
paper_body = ""
|
| 618 |
for k, v in research_sections.items():
|
| 619 |
paper_body += v
|
| 620 |
+
|
| 621 |
+
try:
|
| 622 |
+
return self.abstractive_summary(paper_body)
|
| 623 |
+
except:
|
| 624 |
+
return self.abstractive_summary(self.extractive_summary(paper_body))
|
| 625 |
|
| 626 |
def build_corpus_sectionwise(self, papers):
|
| 627 |
known = ['abstract', 'introduction', 'conclusion']
|
src/__pycache__/Surveyor.cpython-310.pyc
DELETED
|
Binary file (47.8 kB)
|
|
|
src/__pycache__/defaults.cpython-310.pyc
DELETED
|
Binary file (835 Bytes)
|
|
|
src/defaults.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# defaults for arxiv
|
| 2 |
-
|
| 3 |
"max_search": 100,
|
| 4 |
"num_papers": 20,
|
| 5 |
"high_gpu": False,
|
|
@@ -16,5 +16,23 @@ DEFAULTS = {
|
|
| 16 |
"nlp_name": "en_core_sci_scibert",
|
| 17 |
"similarity_nlp_name": "en_core_sci_lg",
|
| 18 |
"kw_model_name": "distilbert-base-nli-mean-tokens",
|
|
|
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
}
|
|
|
|
| 1 |
# defaults for arxiv
|
| 2 |
+
DEFAULTS_HIGH_GPU = {
|
| 3 |
"max_search": 100,
|
| 4 |
"num_papers": 20,
|
| 5 |
"high_gpu": False,
|
|
|
|
| 16 |
"nlp_name": "en_core_sci_scibert",
|
| 17 |
"similarity_nlp_name": "en_core_sci_lg",
|
| 18 |
"kw_model_name": "distilbert-base-nli-mean-tokens",
|
| 19 |
+
}
|
| 20 |
|
| 21 |
+
DEFAULTS_CPU_COMPAT = {
|
| 22 |
+
"max_search": 100,
|
| 23 |
+
"num_papers": 20,
|
| 24 |
+
"high_gpu": False,
|
| 25 |
+
"pdf_dir": "arxiv_data/tarpdfs/",
|
| 26 |
+
"txt_dir": "arxiv_data/fulltext/",
|
| 27 |
+
"img_dir": "arxiv_data/images/",
|
| 28 |
+
"tab_dir": "arxiv_data/tables/",
|
| 29 |
+
"dump_dir": "arxiv_dumps/",
|
| 30 |
+
"models_dir": "saved_models/",
|
| 31 |
+
"title_model_name": "ccdv/lsg-bart-base-4096-arxiv",
|
| 32 |
+
"ex_summ_model_name": "allenai/scibert_scivocab_uncased",
|
| 33 |
+
"ledmodel_name": "bhuvaneswari/t5-small-text_summarization",
|
| 34 |
+
"embedder_name": "paraphrase-MiniLM-L6-v2",
|
| 35 |
+
"nlp_name": "en_core_sci_scibert",
|
| 36 |
+
"similarity_nlp_name": "en_core_sci_lg",
|
| 37 |
+
"kw_model_name": "distilbert-base-nli-mean-tokens",
|
| 38 |
}
|