dayannex committed
Commit 7b68d5a · 1 Parent(s): fb25947

dataset model csv English
Files changed (1)
  1. app.py +9 -13
app.py CHANGED
@@ -477,27 +477,22 @@ class ModeloDataset:
         print('idioma:',idioma)
         self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
         self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
-        print('_sentences',_sentences)
-        print('type(_sentences)',type(_sentences))
-        inputs = self.tokenizer(_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
-        print('inputs')
-        #model.eval()
-
+        sentences_list = _sentences.apply(lambda x: x[0].tolist() if isinstance(x, np.ndarray) else x.tolist())
+        inputs = self.tokenizer(list(sentences_list), padding=True, truncation=True, return_tensors="pt", max_length=512)
         with torch.no_grad():
             outputs = model(**inputs)
-        print('out')
-
+
+
         logits = outputs.logits
-        print('logits',logits)
         predictions = torch.argmax(logits, dim=2)
-
+
         id2label = model.config.id2label


         all_tokens = []
         all_label_ids = []
         all_labels = []
-        for i, sentence in enumerate(_sentences):
+        for i, sentence in enumerate(sentences_list):

             tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[i])
             label_ids = predictions[i].tolist()
@@ -537,11 +532,12 @@ class ModeloDataset:
             i=i+1
         print('new_tokens')
         print(new_tokens[1])
-        print(all_tokens[1])
+        #print(all_tokens[1])

         print(len(new_tokens[1]))
         print(len(new_identificadores[1]))
-
+        print(new_identificadores[1])
+


         return new_identificadores, new_tokens
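The substance of this commit is the input normalization: the English branch previously passed the pandas Series `_sentences` (whose cells hold numpy arrays) straight to the tokenizer, and now converts each cell to a plain Python list first. A minimal sketch of that conversion, using an invented sample Series (the real column layout in app.py may differ):

```python
# Sketch of the normalization added in this commit; the sample Series
# is invented for illustration.
import numpy as np
import pandas as pd

# Each cell holds a 2-D numpy array whose first row is the sentence's
# word list, mirroring the x[0].tolist() branch of the lambda.
_sentences = pd.Series([
    np.array([["John", "Smith", "lives", "in", "Madrid"]]),
    np.array([["Call", "me", "at", "555-0100"]]),
])

sentences_list = _sentences.apply(
    lambda x: x[0].tolist() if isinstance(x, np.ndarray) else x.tolist()
)

# list(sentences_list) is now a plain list of word lists, which is the
# shape the tokenizer call in the diff receives.
print(list(sentences_list))
# [['John', 'Smith', 'lives', 'in', 'Madrid'], ['Call', 'me', 'at', '555-0100']]
```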
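For context, the patched method follows the standard token-classification inference pattern: batch tokenization, a forward pass under torch.no_grad(), argmax over the logits, and mapping predicted label ids back to names via model.config.id2label. Below is a self-contained sketch of that pattern under stated assumptions: the example sentences are invented, and the subword realignment that builds new_tokens and new_identificadores in app.py is elided. (The committed body references model rather than self.model, so model is presumably bound elsewhere in the class; the sketch uses local variables throughout.)

```python
# Standalone sketch of the inference path shown in the diff.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")

sentences = ["John Smith lives in Madrid", "His number is 555-0100"]
inputs = tokenizer(sentences, padding=True, truncation=True,
                   return_tensors="pt", max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

predictions = torch.argmax(outputs.logits, dim=2)  # (batch, seq_len)
id2label = model.config.id2label

for i in range(len(sentences)):
    tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[i])
    labels = [id2label[p] for p in predictions[i].tolist()]
    # Padding tokens receive labels too; a real pipeline would mask
    # them out using inputs.attention_mask.
    print(list(zip(tokens, labels)))
```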