dataset model csv English
app.py CHANGED
@@ -477,27 +477,22 @@ class ModeloDataset:
         print('idioma:',idioma)
         self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
         self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
-
-
-        inputs = self.tokenizer(_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
-        print('inputs')
-        #model.eval()
-
+        sentences_list = _sentences.apply(lambda x: x[0].tolist() if isinstance(x, np.ndarray) else x.tolist())
+        inputs = self.tokenizer(list(sentences_list), padding=True, truncation=True, return_tensors="pt", max_length=512)
         with torch.no_grad():
             outputs = model(**inputs)
-
-
+
+
         logits = outputs.logits
-        print('logits',logits)
         predictions = torch.argmax(logits, dim=2)
-
+
         id2label = model.config.id2label


         all_tokens = []
         all_label_ids = []
         all_labels = []
-        for i, sentence in enumerate(_sentences):
+        for i, sentence in enumerate(sentences_list):

             tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[i])
             label_ids = predictions[i].tolist()
@@ -537,11 +532,12 @@ class ModeloDataset:
             i=i+1
             print('new_tokens')
             print(new_tokens[1])
-            print(all_tokens[1])
+            #print(all_tokens[1])

             print(len(new_tokens[1]))
             print(len(new_identificadores[1]))
-
+            print(new_identificadores[1])
+

         return new_identificadores, new_tokens

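The substantive change in the first hunk is the preprocessing step: instead of handing `_sentences` straight to the tokenizer, each cell is first unwrapped into a plain Python value via `apply`. Below is a minimal sketch of what that conversion does, assuming `_sentences` is a pandas Series whose cells wrap one raw sentence string in a numpy array; the toy data is invented for illustration, since the real CSV layout is not shown in the diff.

```python
import numpy as np
import pandas as pd

# Hypothetical stand-in for the _sentences column: each cell wraps a raw
# sentence string in a one-element numpy array, as can happen after
# reshaping a DataFrame read from a CSV.
_sentences = pd.Series([
    np.array(["El paciente ingresó ayer."]),
    np.array(["Fue dado de alta hoy."]),
])

# The conversion the commit introduces: for numpy arrays take the first
# element (x[0] is a numpy string scalar, whose .tolist() returns a plain
# Python str); any other cell type is expected to expose .tolist() itself.
sentences_list = _sentences.apply(
    lambda x: x[0].tolist() if isinstance(x, np.ndarray) else x.tolist()
)

print(list(sentences_list))
# ['El paciente ingresó ayer.', 'Fue dado de alta hoy.']
```

Under that assumption, `list(sentences_list)` becomes an ordinary list of strings, which matches the batched tokenizer call on the following line of the diff.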
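The rest of the hunk is the usual token-classification inference loop: run the inputs through the model under `torch.no_grad()`, take the argmax of the logits along the label dimension, and translate predicted ids into label names through `model.config.id2label` while recovering the sub-word tokens with `convert_ids_to_tokens`. The following is a self-contained sketch of that flow, assuming the `dayannex/distilbert-tuned-4labels` checkpoint is downloadable and using an invented input sentence; note that the diff's forward pass references a bare `model` even though the checkpoint is loaded onto `self.model`, so this sketch uses module-level variables instead of the class.

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")

sentences = ["El paciente Juan ingresó ayer en Bogotá."]  # invented example input

inputs = tokenizer(sentences, padding=True, truncation=True,
                   return_tensors="pt", max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

# One predicted label id per sub-word position, shape (batch, seq_len).
predictions = torch.argmax(outputs.logits, dim=2)
id2label = model.config.id2label

for i, _ in enumerate(sentences):
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][i])
    labels = [id2label[label_id] for label_id in predictions[i].tolist()]
    # tokens and labels are aligned position by position, including the
    # special [CLS]/[SEP] and any padding positions.
    for token, label in zip(tokens, labels):
        print(token, label)
```

These per-sentence tokens and label ids appear to be what the method accumulates in `all_tokens`, `all_label_ids` and `all_labels` before regrouping them into the `new_identificadores` and `new_tokens` it returns.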