otozz commited on
Commit
d8d3f60
·
verified ·
1 Parent(s): cf2f889

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -45,7 +45,7 @@
45
  "scale_embedding": false,
46
  "suppress_tokens": [],
47
  "torch_dtype": "float32",
48
- "transformers_version": "4.39.3",
49
  "use_cache": true,
50
  "use_weighted_layer_sum": false,
51
  "vocab_size": 51865
 
45
  "scale_embedding": false,
46
  "suppress_tokens": [],
47
  "torch_dtype": "float32",
48
+ "transformers_version": "4.40.1",
49
  "use_cache": true,
50
  "use_weighted_layer_sum": false,
51
  "vocab_size": 51865
generation_config.json CHANGED
@@ -261,5 +261,5 @@
261
  "transcribe": 50359,
262
  "translate": 50358
263
  },
264
- "transformers_version": "4.39.3"
265
  }
 
261
  "transcribe": 50359,
262
  "translate": 50358
263
  },
264
+ "transformers_version": "4.40.1"
265
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db9e3410c0d181cbca233c2ef3a144140b3cc9f461300e2644223e31ce627ab
3
  size 966995080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94c05576879734f8c7b8e89ca55acd5837783f70d88bc075bab3e5d5716e3782
3
  size 966995080
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5af96f34075315a8d145370c7ec1e9c2af1a07fc7c618f9842ab4b9f7b699e7
3
- size 1925050668
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4c56cd3dff9876f616561350d28eafa2796e84ff54dea04dd73bca06bcf1e6c
3
+ size 1925064044
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:effb5f8ebfbf9b741e28f68a88bc81e631e8f69f509b47b422996451a4d48cad
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8c1ab21c342191c9c81110adcd7b6b9a742ee285eb232c27345529e5541c1f
3
+ size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51e3c4509f91cdbfa101ed1c8c1a68c474ae85392f457afeb413ff74d7331cc6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43eda48fd705f454e2d83b6fd6364eb3c4bc6f8ca22fff1070942ef66fa65a40
3
  size 1064
trainer_state.json CHANGED
@@ -1,652 +1,492 @@
1
  {
2
- "best_metric": 67.89809578166347,
3
- "best_model_checkpoint": "./whisper-small-dialect_maghrebi_seed168/checkpoint-1250",
4
- "epoch": 1.1983223487118035,
5
  "eval_steps": 250,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
- "grad_norm": 19.585220336914062,
14
  "learning_rate": 5.000000000000001e-07,
15
- "loss": 0.9335,
16
  "step": 25
17
  },
18
  {
19
- "epoch": 0.03,
20
- "grad_norm": 13.835809707641602,
21
  "learning_rate": 1.0000000000000002e-06,
22
- "loss": 0.8126,
23
  "step": 50
24
  },
25
  {
26
- "epoch": 0.04,
27
- "grad_norm": 17.269922256469727,
28
  "learning_rate": 1.5e-06,
29
- "loss": 0.7521,
30
  "step": 75
31
  },
32
  {
33
- "epoch": 0.06,
34
- "grad_norm": 15.5454683303833,
35
  "learning_rate": 2.0000000000000003e-06,
36
- "loss": 0.7946,
37
  "step": 100
38
  },
39
  {
40
- "epoch": 0.07,
41
- "grad_norm": 23.1411190032959,
42
  "learning_rate": 2.5e-06,
43
- "loss": 0.8514,
44
  "step": 125
45
  },
46
  {
47
- "epoch": 0.09,
48
- "grad_norm": 15.324691772460938,
49
  "learning_rate": 3e-06,
50
- "loss": 0.8826,
51
  "step": 150
52
  },
53
  {
54
- "epoch": 0.1,
55
- "grad_norm": 20.527490615844727,
56
  "learning_rate": 3.5e-06,
57
- "loss": 0.8558,
58
  "step": 175
59
  },
60
  {
61
- "epoch": 0.12,
62
- "grad_norm": 20.18839454650879,
63
  "learning_rate": 4.000000000000001e-06,
64
- "loss": 0.7788,
65
  "step": 200
66
  },
67
  {
68
- "epoch": 0.13,
69
- "grad_norm": 21.231355667114258,
70
  "learning_rate": 4.5e-06,
71
- "loss": 0.8753,
72
  "step": 225
73
  },
74
  {
75
- "epoch": 0.15,
76
- "grad_norm": 15.271389961242676,
77
  "learning_rate": 5e-06,
78
- "loss": 0.8565,
79
  "step": 250
80
  },
81
  {
82
- "epoch": 0.15,
83
- "eval_cer": 55.99867996039881,
84
- "eval_loss": 0.8528127670288086,
85
- "eval_runtime": 1766.4518,
86
- "eval_samples_per_second": 1.89,
87
- "eval_steps_per_second": 0.237,
88
- "eval_wer": 77.37249001193379,
89
  "step": 250
90
  },
91
  {
92
- "epoch": 0.16,
93
- "grad_norm": 25.388628005981445,
94
  "learning_rate": 5.500000000000001e-06,
95
- "loss": 0.8373,
96
  "step": 275
97
  },
98
  {
99
- "epoch": 0.18,
100
- "grad_norm": 23.94950294494629,
101
  "learning_rate": 6e-06,
102
- "loss": 0.8126,
103
  "step": 300
104
  },
105
  {
106
- "epoch": 0.19,
107
- "grad_norm": 17.08974838256836,
108
  "learning_rate": 6.5000000000000004e-06,
109
- "loss": 0.8618,
110
  "step": 325
111
  },
112
  {
113
- "epoch": 0.21,
114
- "grad_norm": 22.02919578552246,
115
  "learning_rate": 7e-06,
116
- "loss": 0.7687,
117
  "step": 350
118
  },
119
  {
120
- "epoch": 0.22,
121
- "grad_norm": 24.780494689941406,
122
  "learning_rate": 7.500000000000001e-06,
123
- "loss": 0.7859,
124
  "step": 375
125
  },
126
  {
127
- "epoch": 0.24,
128
- "grad_norm": 19.000207901000977,
129
  "learning_rate": 8.000000000000001e-06,
130
- "loss": 0.8823,
131
  "step": 400
132
  },
133
  {
134
- "epoch": 0.25,
135
- "grad_norm": 21.721498489379883,
136
  "learning_rate": 8.5e-06,
137
- "loss": 0.7591,
138
  "step": 425
139
  },
140
  {
141
- "epoch": 0.27,
142
- "grad_norm": 17.463626861572266,
143
  "learning_rate": 9e-06,
144
- "loss": 0.7987,
145
  "step": 450
146
  },
147
  {
148
- "epoch": 0.28,
149
- "grad_norm": 16.86531639099121,
150
  "learning_rate": 9.5e-06,
151
- "loss": 0.8929,
152
  "step": 475
153
  },
154
  {
155
- "epoch": 0.3,
156
- "grad_norm": 20.8975830078125,
157
  "learning_rate": 1e-05,
158
- "loss": 0.906,
159
  "step": 500
160
  },
161
  {
162
- "epoch": 0.3,
163
- "eval_cer": 57.98973969219077,
164
- "eval_loss": 0.8842513561248779,
165
- "eval_runtime": 1715.4755,
166
- "eval_samples_per_second": 1.946,
167
- "eval_steps_per_second": 0.244,
168
- "eval_wer": 83.06957920406786,
169
  "step": 500
170
  },
171
  {
172
- "epoch": 0.31,
173
- "grad_norm": 18.66194725036621,
174
  "learning_rate": 9.944444444444445e-06,
175
- "loss": 0.9078,
176
  "step": 525
177
  },
178
  {
179
- "epoch": 0.33,
180
- "grad_norm": 21.260807037353516,
181
  "learning_rate": 9.88888888888889e-06,
182
- "loss": 0.9685,
183
  "step": 550
184
  },
185
  {
186
- "epoch": 0.34,
187
- "grad_norm": 23.54132080078125,
188
  "learning_rate": 9.833333333333333e-06,
189
- "loss": 0.9875,
190
  "step": 575
191
  },
192
  {
193
- "epoch": 0.36,
194
- "grad_norm": 17.314271926879883,
195
  "learning_rate": 9.777777777777779e-06,
196
- "loss": 0.8779,
197
  "step": 600
198
  },
199
  {
200
- "epoch": 0.37,
201
- "grad_norm": 14.517264366149902,
202
  "learning_rate": 9.722222222222223e-06,
203
- "loss": 0.8386,
204
  "step": 625
205
  },
206
  {
207
- "epoch": 0.39,
208
- "grad_norm": 22.82421112060547,
209
  "learning_rate": 9.666666666666667e-06,
210
- "loss": 0.8628,
211
  "step": 650
212
  },
213
  {
214
- "epoch": 0.4,
215
- "grad_norm": 19.775588989257812,
216
  "learning_rate": 9.611111111111112e-06,
217
- "loss": 0.8895,
218
  "step": 675
219
  },
220
  {
221
- "epoch": 0.42,
222
- "grad_norm": 24.209157943725586,
223
  "learning_rate": 9.555555555555556e-06,
224
- "loss": 0.8814,
225
  "step": 700
226
  },
227
  {
228
- "epoch": 0.43,
229
- "grad_norm": 14.87100887298584,
230
  "learning_rate": 9.5e-06,
231
- "loss": 0.7837,
232
  "step": 725
233
  },
234
  {
235
- "epoch": 0.45,
236
- "grad_norm": 21.66838264465332,
237
  "learning_rate": 9.444444444444445e-06,
238
- "loss": 0.8671,
239
  "step": 750
240
  },
241
  {
242
- "epoch": 0.45,
243
- "eval_cer": 46.89840695220857,
244
- "eval_loss": 0.901644229888916,
245
- "eval_runtime": 1635.6088,
246
- "eval_samples_per_second": 2.041,
247
- "eval_steps_per_second": 0.256,
248
- "eval_wer": 68.61931199086804,
249
  "step": 750
250
  },
251
  {
252
- "epoch": 0.46,
253
- "grad_norm": 24.317537307739258,
254
  "learning_rate": 9.38888888888889e-06,
255
- "loss": 0.8958,
256
  "step": 775
257
  },
258
  {
259
- "epoch": 0.48,
260
- "grad_norm": 17.147018432617188,
261
  "learning_rate": 9.333333333333334e-06,
262
- "loss": 0.7864,
263
  "step": 800
264
  },
265
  {
266
- "epoch": 0.49,
267
- "grad_norm": 16.60604476928711,
268
  "learning_rate": 9.277777777777778e-06,
269
- "loss": 0.9282,
270
  "step": 825
271
  },
272
  {
273
- "epoch": 0.51,
274
- "grad_norm": 16.8157901763916,
275
  "learning_rate": 9.222222222222224e-06,
276
- "loss": 0.9055,
277
  "step": 850
278
  },
279
  {
280
- "epoch": 0.52,
281
- "grad_norm": 23.045429229736328,
282
  "learning_rate": 9.166666666666666e-06,
283
- "loss": 0.762,
284
  "step": 875
285
  },
286
  {
287
- "epoch": 0.54,
288
- "grad_norm": 20.419845581054688,
289
  "learning_rate": 9.111111111111112e-06,
290
- "loss": 0.9957,
291
  "step": 900
292
  },
293
  {
294
- "epoch": 0.55,
295
- "grad_norm": 12.999320030212402,
296
  "learning_rate": 9.055555555555556e-06,
297
- "loss": 0.8029,
298
  "step": 925
299
  },
300
  {
301
- "epoch": 0.57,
302
- "grad_norm": 19.02703094482422,
303
  "learning_rate": 9e-06,
304
- "loss": 0.844,
305
  "step": 950
306
  },
307
  {
308
- "epoch": 0.58,
309
- "grad_norm": 16.586179733276367,
310
  "learning_rate": 8.944444444444446e-06,
311
- "loss": 0.9192,
312
  "step": 975
313
  },
314
  {
315
- "epoch": 0.6,
316
- "grad_norm": 19.809429168701172,
317
  "learning_rate": 8.888888888888888e-06,
318
- "loss": 0.8423,
319
  "step": 1000
320
  },
321
  {
322
- "epoch": 0.6,
323
- "eval_cer": 56.56169685090553,
324
- "eval_loss": 0.8899121284484863,
325
- "eval_runtime": 1767.7691,
326
- "eval_samples_per_second": 1.888,
327
- "eval_steps_per_second": 0.236,
328
- "eval_wer": 72.81689410055519,
329
  "step": 1000
330
  },
331
  {
332
- "epoch": 0.61,
333
- "grad_norm": 18.651063919067383,
334
  "learning_rate": 8.833333333333334e-06,
335
- "loss": 0.885,
336
  "step": 1025
337
  },
338
  {
339
- "epoch": 0.63,
340
- "grad_norm": 16.94672203063965,
341
  "learning_rate": 8.777777777777778e-06,
342
- "loss": 0.9861,
343
  "step": 1050
344
  },
345
  {
346
- "epoch": 0.64,
347
- "grad_norm": 18.808040618896484,
348
  "learning_rate": 8.722222222222224e-06,
349
- "loss": 0.882,
350
  "step": 1075
351
  },
352
  {
353
- "epoch": 0.66,
354
- "grad_norm": 18.73299789428711,
355
  "learning_rate": 8.666666666666668e-06,
356
- "loss": 0.8396,
357
  "step": 1100
358
  },
359
  {
360
- "epoch": 0.67,
361
- "grad_norm": 15.416051864624023,
362
  "learning_rate": 8.611111111111112e-06,
363
- "loss": 0.8821,
364
  "step": 1125
365
  },
366
  {
367
- "epoch": 0.69,
368
- "grad_norm": 16.22429847717285,
369
  "learning_rate": 8.555555555555556e-06,
370
- "loss": 0.834,
371
  "step": 1150
372
  },
373
  {
374
- "epoch": 0.7,
375
- "grad_norm": 19.464521408081055,
376
  "learning_rate": 8.5e-06,
377
- "loss": 0.7611,
378
  "step": 1175
379
  },
380
  {
381
- "epoch": 0.72,
382
- "grad_norm": 19.693363189697266,
383
  "learning_rate": 8.444444444444446e-06,
384
- "loss": 0.8356,
385
  "step": 1200
386
  },
387
  {
388
- "epoch": 0.73,
389
- "grad_norm": 19.970016479492188,
390
  "learning_rate": 8.38888888888889e-06,
391
- "loss": 0.8501,
392
  "step": 1225
393
  },
394
  {
395
- "epoch": 0.75,
396
- "grad_norm": 17.509977340698242,
397
  "learning_rate": 8.333333333333334e-06,
398
- "loss": 0.8217,
399
  "step": 1250
400
  },
401
  {
402
- "epoch": 0.75,
403
- "eval_cer": 53.066591997759936,
404
- "eval_loss": 0.8848564624786377,
405
- "eval_runtime": 1728.777,
406
- "eval_samples_per_second": 1.931,
407
- "eval_steps_per_second": 0.242,
408
- "eval_wer": 67.89809578166347,
409
  "step": 1250
410
  },
411
  {
412
- "epoch": 0.76,
413
- "grad_norm": 19.724687576293945,
414
  "learning_rate": 8.277777777777778e-06,
415
- "loss": 0.8353,
416
  "step": 1275
417
  },
418
  {
419
- "epoch": 0.78,
420
- "grad_norm": 23.104583740234375,
421
  "learning_rate": 8.222222222222222e-06,
422
- "loss": 0.788,
423
  "step": 1300
424
  },
425
  {
426
- "epoch": 0.79,
427
- "grad_norm": 18.423364639282227,
428
  "learning_rate": 8.166666666666668e-06,
429
- "loss": 0.8572,
430
  "step": 1325
431
  },
432
  {
433
- "epoch": 0.81,
434
- "grad_norm": 20.901500701904297,
435
  "learning_rate": 8.111111111111112e-06,
436
- "loss": 0.8129,
437
  "step": 1350
438
  },
439
  {
440
- "epoch": 0.82,
441
- "grad_norm": 14.155868530273438,
442
  "learning_rate": 8.055555555555557e-06,
443
- "loss": 0.8769,
444
  "step": 1375
445
  },
446
  {
447
- "epoch": 0.84,
448
- "grad_norm": 15.407185554504395,
449
  "learning_rate": 8.000000000000001e-06,
450
- "loss": 0.742,
451
  "step": 1400
452
  },
453
  {
454
- "epoch": 0.85,
455
- "grad_norm": 27.230249404907227,
456
  "learning_rate": 7.944444444444445e-06,
457
- "loss": 0.8353,
458
  "step": 1425
459
  },
460
  {
461
- "epoch": 0.87,
462
- "grad_norm": 12.835943222045898,
463
  "learning_rate": 7.88888888888889e-06,
464
- "loss": 0.8611,
465
  "step": 1450
466
  },
467
  {
468
- "epoch": 0.88,
469
- "grad_norm": 22.758798599243164,
470
  "learning_rate": 7.833333333333333e-06,
471
- "loss": 0.864,
472
  "step": 1475
473
  },
474
  {
475
- "epoch": 0.9,
476
- "grad_norm": 15.53707218170166,
477
  "learning_rate": 7.77777777777778e-06,
478
- "loss": 0.8133,
479
  "step": 1500
480
  },
481
  {
482
- "epoch": 0.9,
483
- "eval_cer": 57.49372481174435,
484
- "eval_loss": 0.8766074776649475,
485
- "eval_runtime": 1775.9327,
486
- "eval_samples_per_second": 1.88,
487
- "eval_steps_per_second": 0.235,
488
- "eval_wer": 73.74046593680278,
489
  "step": 1500
490
- },
491
- {
492
- "epoch": 0.91,
493
- "grad_norm": 17.66630744934082,
494
- "learning_rate": 7.722222222222223e-06,
495
- "loss": 0.8151,
496
- "step": 1525
497
- },
498
- {
499
- "epoch": 0.93,
500
- "grad_norm": 11.854238510131836,
501
- "learning_rate": 7.666666666666667e-06,
502
- "loss": 0.8589,
503
- "step": 1550
504
- },
505
- {
506
- "epoch": 0.94,
507
- "grad_norm": 20.530500411987305,
508
- "learning_rate": 7.611111111111111e-06,
509
- "loss": 0.857,
510
- "step": 1575
511
- },
512
- {
513
- "epoch": 0.96,
514
- "grad_norm": 15.698258399963379,
515
- "learning_rate": 7.555555555555556e-06,
516
- "loss": 0.9099,
517
- "step": 1600
518
- },
519
- {
520
- "epoch": 0.97,
521
- "grad_norm": 19.62598419189453,
522
- "learning_rate": 7.500000000000001e-06,
523
- "loss": 0.876,
524
- "step": 1625
525
- },
526
- {
527
- "epoch": 0.99,
528
- "grad_norm": 21.638870239257812,
529
- "learning_rate": 7.444444444444445e-06,
530
- "loss": 0.8588,
531
- "step": 1650
532
- },
533
- {
534
- "epoch": 1.0,
535
- "grad_norm": 14.121610641479492,
536
- "learning_rate": 7.38888888888889e-06,
537
- "loss": 0.7806,
538
- "step": 1675
539
- },
540
- {
541
- "epoch": 1.02,
542
- "grad_norm": 13.002812385559082,
543
- "learning_rate": 7.333333333333333e-06,
544
- "loss": 0.5343,
545
- "step": 1700
546
- },
547
- {
548
- "epoch": 1.03,
549
- "grad_norm": 15.227038383483887,
550
- "learning_rate": 7.277777777777778e-06,
551
- "loss": 0.5724,
552
- "step": 1725
553
- },
554
- {
555
- "epoch": 1.05,
556
- "grad_norm": 14.572872161865234,
557
- "learning_rate": 7.222222222222223e-06,
558
- "loss": 0.5831,
559
- "step": 1750
560
- },
561
- {
562
- "epoch": 1.05,
563
- "eval_cer": 67.86403592107763,
564
- "eval_loss": 0.8728525042533875,
565
- "eval_runtime": 1895.6388,
566
- "eval_samples_per_second": 1.761,
567
- "eval_steps_per_second": 0.221,
568
- "eval_wer": 83.52617651637006,
569
- "step": 1750
570
- },
571
- {
572
- "epoch": 1.06,
573
- "grad_norm": 10.929813385009766,
574
- "learning_rate": 7.166666666666667e-06,
575
- "loss": 0.5448,
576
- "step": 1775
577
- },
578
- {
579
- "epoch": 1.08,
580
- "grad_norm": 14.270151138305664,
581
- "learning_rate": 7.111111111111112e-06,
582
- "loss": 0.5362,
583
- "step": 1800
584
- },
585
- {
586
- "epoch": 1.09,
587
- "grad_norm": 12.08752727508545,
588
- "learning_rate": 7.055555555555557e-06,
589
- "loss": 0.5602,
590
- "step": 1825
591
- },
592
- {
593
- "epoch": 1.11,
594
- "grad_norm": 13.561049461364746,
595
- "learning_rate": 7e-06,
596
- "loss": 0.5308,
597
- "step": 1850
598
- },
599
- {
600
- "epoch": 1.12,
601
- "grad_norm": 15.52322006225586,
602
- "learning_rate": 6.944444444444445e-06,
603
- "loss": 0.5554,
604
- "step": 1875
605
- },
606
- {
607
- "epoch": 1.14,
608
- "grad_norm": 12.2987699508667,
609
- "learning_rate": 6.88888888888889e-06,
610
- "loss": 0.5622,
611
- "step": 1900
612
- },
613
- {
614
- "epoch": 1.15,
615
- "grad_norm": 14.216822624206543,
616
- "learning_rate": 6.833333333333334e-06,
617
- "loss": 0.5674,
618
- "step": 1925
619
- },
620
- {
621
- "epoch": 1.17,
622
- "grad_norm": 12.9888916015625,
623
- "learning_rate": 6.777777777777779e-06,
624
- "loss": 0.5361,
625
- "step": 1950
626
- },
627
- {
628
- "epoch": 1.18,
629
- "grad_norm": 16.80472183227539,
630
- "learning_rate": 6.7222222222222235e-06,
631
- "loss": 0.5632,
632
- "step": 1975
633
- },
634
- {
635
- "epoch": 1.2,
636
- "grad_norm": 14.355997085571289,
637
- "learning_rate": 6.666666666666667e-06,
638
- "loss": 0.5589,
639
- "step": 2000
640
- },
641
- {
642
- "epoch": 1.2,
643
- "eval_cer": 66.10498314949449,
644
- "eval_loss": 0.8783968091011047,
645
- "eval_runtime": 1843.2079,
646
- "eval_samples_per_second": 1.811,
647
- "eval_steps_per_second": 0.227,
648
- "eval_wer": 86.54075649872878,
649
- "step": 2000
650
  }
651
  ],
652
  "logging_steps": 25,
@@ -654,7 +494,7 @@
654
  "num_input_tokens_seen": 0,
655
  "num_train_epochs": 3,
656
  "save_steps": 250,
657
- "total_flos": 4.61736640512e+18,
658
  "train_batch_size": 8,
659
  "trial_name": null,
660
  "trial_params": null
 
1
  {
2
+ "best_metric": 82.81533751880869,
3
+ "best_model_checkpoint": "/scratch/p310333/whisper-small-dialect_maghrebi_seed168/checkpoint-750",
4
+ "epoch": 0.8987417615338527,
5
  "eval_steps": 250,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.014979029358897543,
13
+ "grad_norm": 25.04779052734375,
14
  "learning_rate": 5.000000000000001e-07,
15
+ "loss": 1.4944,
16
  "step": 25
17
  },
18
  {
19
+ "epoch": 0.029958058717795086,
20
+ "grad_norm": 19.618623733520508,
21
  "learning_rate": 1.0000000000000002e-06,
22
+ "loss": 1.3038,
23
  "step": 50
24
  },
25
  {
26
+ "epoch": 0.04493708807669263,
27
+ "grad_norm": 23.487409591674805,
28
  "learning_rate": 1.5e-06,
29
+ "loss": 1.1837,
30
  "step": 75
31
  },
32
  {
33
+ "epoch": 0.05991611743559017,
34
+ "grad_norm": 23.077299118041992,
35
  "learning_rate": 2.0000000000000003e-06,
36
+ "loss": 1.1554,
37
  "step": 100
38
  },
39
  {
40
+ "epoch": 0.07489514679448772,
41
+ "grad_norm": 26.876270294189453,
42
  "learning_rate": 2.5e-06,
43
+ "loss": 1.1074,
44
  "step": 125
45
  },
46
  {
47
+ "epoch": 0.08987417615338526,
48
+ "grad_norm": 19.345041275024414,
49
  "learning_rate": 3e-06,
50
+ "loss": 1.0981,
51
  "step": 150
52
  },
53
  {
54
+ "epoch": 0.1048532055122828,
55
+ "grad_norm": 24.393747329711914,
56
  "learning_rate": 3.5e-06,
57
+ "loss": 1.0891,
58
  "step": 175
59
  },
60
  {
61
+ "epoch": 0.11983223487118035,
62
+ "grad_norm": 21.781496047973633,
63
  "learning_rate": 4.000000000000001e-06,
64
+ "loss": 0.9027,
65
  "step": 200
66
  },
67
  {
68
+ "epoch": 0.1348112642300779,
69
+ "grad_norm": 25.396799087524414,
70
  "learning_rate": 4.5e-06,
71
+ "loss": 0.9823,
72
  "step": 225
73
  },
74
  {
75
+ "epoch": 0.14979029358897544,
76
+ "grad_norm": 13.59937858581543,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.8868,
79
  "step": 250
80
  },
81
  {
82
+ "epoch": 0.14979029358897544,
83
+ "eval_cer": 69.79109373281199,
84
+ "eval_loss": 1.4167897701263428,
85
+ "eval_runtime": 797.0114,
86
+ "eval_samples_per_second": 4.188,
87
+ "eval_steps_per_second": 0.524,
88
+ "eval_wer": 92.91236444767291,
89
  "step": 250
90
  },
91
  {
92
+ "epoch": 0.16476932294787297,
93
+ "grad_norm": 23.348133087158203,
94
  "learning_rate": 5.500000000000001e-06,
95
+ "loss": 0.778,
96
  "step": 275
97
  },
98
  {
99
+ "epoch": 0.17974835230677053,
100
+ "grad_norm": 24.561851501464844,
101
  "learning_rate": 6e-06,
102
+ "loss": 0.7693,
103
  "step": 300
104
  },
105
  {
106
+ "epoch": 0.19472738166566805,
107
+ "grad_norm": 16.083293914794922,
108
  "learning_rate": 6.5000000000000004e-06,
109
+ "loss": 0.7149,
110
  "step": 325
111
  },
112
  {
113
+ "epoch": 0.2097064110245656,
114
+ "grad_norm": 20.850666046142578,
115
  "learning_rate": 7e-06,
116
+ "loss": 0.6737,
117
  "step": 350
118
  },
119
  {
120
+ "epoch": 0.22468544038346316,
121
+ "grad_norm": 20.03885269165039,
122
  "learning_rate": 7.500000000000001e-06,
123
+ "loss": 0.5618,
124
  "step": 375
125
  },
126
  {
127
+ "epoch": 0.2396644697423607,
128
+ "grad_norm": 16.334306716918945,
129
  "learning_rate": 8.000000000000001e-06,
130
+ "loss": 0.7037,
131
  "step": 400
132
  },
133
  {
134
+ "epoch": 0.2546434991012582,
135
+ "grad_norm": 14.770045280456543,
136
  "learning_rate": 8.5e-06,
137
+ "loss": 0.5352,
138
  "step": 425
139
  },
140
  {
141
+ "epoch": 0.2696225284601558,
142
+ "grad_norm": 15.933141708374023,
143
  "learning_rate": 9e-06,
144
+ "loss": 0.5398,
145
  "step": 450
146
  },
147
  {
148
+ "epoch": 0.28460155781905333,
149
+ "grad_norm": 11.69640827178955,
150
  "learning_rate": 9.5e-06,
151
+ "loss": 0.6248,
152
  "step": 475
153
  },
154
  {
155
+ "epoch": 0.2995805871779509,
156
+ "grad_norm": 26.167146682739258,
157
  "learning_rate": 1e-05,
158
+ "loss": 0.6822,
159
  "step": 500
160
  },
161
  {
162
+ "epoch": 0.2995805871779509,
163
+ "eval_cer": 81.20443613308399,
164
+ "eval_loss": 1.5288528203964233,
165
+ "eval_runtime": 844.5862,
166
+ "eval_samples_per_second": 3.952,
167
+ "eval_steps_per_second": 0.495,
168
+ "eval_wer": 98.19955377989935,
169
  "step": 500
170
  },
171
  {
172
+ "epoch": 0.3145596165368484,
173
+ "grad_norm": 26.101837158203125,
174
  "learning_rate": 9.944444444444445e-06,
175
+ "loss": 0.9889,
176
  "step": 525
177
  },
178
  {
179
+ "epoch": 0.32953864589574594,
180
+ "grad_norm": 21.356121063232422,
181
  "learning_rate": 9.88888888888889e-06,
182
+ "loss": 1.1228,
183
  "step": 550
184
  },
185
  {
186
+ "epoch": 0.3445176752546435,
187
+ "grad_norm": 25.74348258972168,
188
  "learning_rate": 9.833333333333333e-06,
189
+ "loss": 1.0572,
190
  "step": 575
191
  },
192
  {
193
+ "epoch": 0.35949670461354105,
194
+ "grad_norm": 19.001972198486328,
195
  "learning_rate": 9.777777777777779e-06,
196
+ "loss": 0.9565,
197
  "step": 600
198
  },
199
  {
200
+ "epoch": 0.3744757339724386,
201
+ "grad_norm": 17.986818313598633,
202
  "learning_rate": 9.722222222222223e-06,
203
+ "loss": 0.9291,
204
  "step": 625
205
  },
206
  {
207
+ "epoch": 0.3894547633313361,
208
+ "grad_norm": 21.8798770904541,
209
  "learning_rate": 9.666666666666667e-06,
210
+ "loss": 0.9225,
211
  "step": 650
212
  },
213
  {
214
+ "epoch": 0.40443379269023366,
215
+ "grad_norm": 20.887165069580078,
216
  "learning_rate": 9.611111111111112e-06,
217
+ "loss": 0.9619,
218
  "step": 675
219
  },
220
  {
221
+ "epoch": 0.4194128220491312,
222
+ "grad_norm": 26.817380905151367,
223
  "learning_rate": 9.555555555555556e-06,
224
+ "loss": 0.9518,
225
  "step": 700
226
  },
227
  {
228
+ "epoch": 0.4343918514080288,
229
+ "grad_norm": 15.77753734588623,
230
  "learning_rate": 9.5e-06,
231
+ "loss": 0.8798,
232
  "step": 725
233
  },
234
  {
235
+ "epoch": 0.44937088076692633,
236
+ "grad_norm": 29.78970718383789,
237
  "learning_rate": 9.444444444444445e-06,
238
+ "loss": 1.1209,
239
  "step": 750
240
  },
241
  {
242
+ "epoch": 0.44937088076692633,
243
+ "eval_cer": 56.33469004070122,
244
+ "eval_loss": 1.4300882816314697,
245
+ "eval_runtime": 707.5486,
246
+ "eval_samples_per_second": 4.718,
247
+ "eval_steps_per_second": 0.591,
248
+ "eval_wer": 82.81533751880869,
249
  "step": 750
250
  },
251
  {
252
+ "epoch": 0.46434991012582383,
253
+ "grad_norm": 28.685291290283203,
254
  "learning_rate": 9.38888888888889e-06,
255
+ "loss": 1.4197,
256
  "step": 775
257
  },
258
  {
259
+ "epoch": 0.4793289394847214,
260
+ "grad_norm": 21.239105224609375,
261
  "learning_rate": 9.333333333333334e-06,
262
+ "loss": 1.2822,
263
  "step": 800
264
  },
265
  {
266
+ "epoch": 0.49430796884361894,
267
+ "grad_norm": 23.61458396911621,
268
  "learning_rate": 9.277777777777778e-06,
269
+ "loss": 1.473,
270
  "step": 825
271
  },
272
  {
273
+ "epoch": 0.5092869982025164,
274
+ "grad_norm": 22.278364181518555,
275
  "learning_rate": 9.222222222222224e-06,
276
+ "loss": 1.3492,
277
  "step": 850
278
  },
279
  {
280
+ "epoch": 0.524266027561414,
281
+ "grad_norm": 27.96851921081543,
282
  "learning_rate": 9.166666666666666e-06,
283
+ "loss": 1.34,
284
  "step": 875
285
  },
286
  {
287
+ "epoch": 0.5392450569203115,
288
+ "grad_norm": 25.204416275024414,
289
  "learning_rate": 9.111111111111112e-06,
290
+ "loss": 1.5126,
291
  "step": 900
292
  },
293
  {
294
+ "epoch": 0.5542240862792092,
295
+ "grad_norm": 18.80275535583496,
296
  "learning_rate": 9.055555555555556e-06,
297
+ "loss": 1.2935,
298
  "step": 925
299
  },
300
  {
301
+ "epoch": 0.5692031156381067,
302
+ "grad_norm": 22.869731903076172,
303
  "learning_rate": 9e-06,
304
+ "loss": 1.3413,
305
  "step": 950
306
  },
307
  {
308
+ "epoch": 0.5841821449970042,
309
+ "grad_norm": 22.38252067565918,
310
  "learning_rate": 8.944444444444446e-06,
311
+ "loss": 1.4085,
312
  "step": 975
313
  },
314
  {
315
+ "epoch": 0.5991611743559018,
316
+ "grad_norm": 24.1107120513916,
317
  "learning_rate": 8.888888888888888e-06,
318
+ "loss": 1.3236,
319
  "step": 1000
320
  },
321
  {
322
+ "epoch": 0.5991611743559018,
323
+ "eval_cer": 66.34499034971049,
324
+ "eval_loss": 1.349813461303711,
325
+ "eval_runtime": 764.465,
326
+ "eval_samples_per_second": 4.366,
327
+ "eval_steps_per_second": 0.547,
328
+ "eval_wer": 88.46054065272662,
329
  "step": 1000
330
  },
331
  {
332
+ "epoch": 0.6141402037147993,
333
+ "grad_norm": 29.156917572021484,
334
  "learning_rate": 8.833333333333334e-06,
335
+ "loss": 1.3896,
336
  "step": 1025
337
  },
338
  {
339
+ "epoch": 0.6291192330736968,
340
+ "grad_norm": 22.938846588134766,
341
  "learning_rate": 8.777777777777778e-06,
342
+ "loss": 1.4654,
343
  "step": 1050
344
  },
345
  {
346
+ "epoch": 0.6440982624325944,
347
+ "grad_norm": 25.43636703491211,
348
  "learning_rate": 8.722222222222224e-06,
349
+ "loss": 1.3499,
350
  "step": 1075
351
  },
352
  {
353
+ "epoch": 0.6590772917914919,
354
+ "grad_norm": 25.42586326599121,
355
  "learning_rate": 8.666666666666668e-06,
356
+ "loss": 1.2745,
357
  "step": 1100
358
  },
359
  {
360
+ "epoch": 0.6740563211503895,
361
+ "grad_norm": 23.000181198120117,
362
  "learning_rate": 8.611111111111112e-06,
363
+ "loss": 1.3675,
364
  "step": 1125
365
  },
366
  {
367
+ "epoch": 0.689035350509287,
368
+ "grad_norm": 23.148645401000977,
369
  "learning_rate": 8.555555555555556e-06,
370
+ "loss": 1.2538,
371
  "step": 1150
372
  },
373
  {
374
+ "epoch": 0.7040143798681845,
375
+ "grad_norm": 22.15021514892578,
376
  "learning_rate": 8.5e-06,
377
+ "loss": 1.2017,
378
  "step": 1175
379
  },
380
  {
381
+ "epoch": 0.7189934092270821,
382
+ "grad_norm": 25.774471282958984,
383
  "learning_rate": 8.444444444444446e-06,
384
+ "loss": 1.2524,
385
  "step": 1200
386
  },
387
  {
388
+ "epoch": 0.7339724385859796,
389
+ "grad_norm": 22.58797836303711,
390
  "learning_rate": 8.38888888888889e-06,
391
+ "loss": 1.2871,
392
  "step": 1225
393
  },
394
  {
395
+ "epoch": 0.7489514679448772,
396
+ "grad_norm": 20.87167739868164,
397
  "learning_rate": 8.333333333333334e-06,
398
+ "loss": 1.2571,
399
  "step": 1250
400
  },
401
  {
402
+ "epoch": 0.7489514679448772,
403
+ "eval_cer": 75.80427412822385,
404
+ "eval_loss": 1.3092447519302368,
405
+ "eval_runtime": 785.5014,
406
+ "eval_samples_per_second": 4.25,
407
+ "eval_steps_per_second": 0.532,
408
+ "eval_wer": 94.10574378664452,
409
  "step": 1250
410
  },
411
  {
412
+ "epoch": 0.7639304973037747,
413
+ "grad_norm": 23.92608642578125,
414
  "learning_rate": 8.277777777777778e-06,
415
+ "loss": 1.2208,
416
  "step": 1275
417
  },
418
  {
419
+ "epoch": 0.7789095266626722,
420
+ "grad_norm": 27.465702056884766,
421
  "learning_rate": 8.222222222222222e-06,
422
+ "loss": 1.209,
423
  "step": 1300
424
  },
425
  {
426
+ "epoch": 0.7938885560215698,
427
+ "grad_norm": 19.662220001220703,
428
  "learning_rate": 8.166666666666668e-06,
429
+ "loss": 1.2772,
430
  "step": 1325
431
  },
432
  {
433
+ "epoch": 0.8088675853804673,
434
+ "grad_norm": 24.321632385253906,
435
  "learning_rate": 8.111111111111112e-06,
436
+ "loss": 1.1859,
437
  "step": 1350
438
  },
439
  {
440
+ "epoch": 0.8238466147393649,
441
+ "grad_norm": 22.470956802368164,
442
  "learning_rate": 8.055555555555557e-06,
443
+ "loss": 1.291,
444
  "step": 1375
445
  },
446
  {
447
+ "epoch": 0.8388256440982624,
448
+ "grad_norm": 20.14983558654785,
449
  "learning_rate": 8.000000000000001e-06,
450
+ "loss": 1.1293,
451
  "step": 1400
452
  },
453
  {
454
+ "epoch": 0.8538046734571599,
455
+ "grad_norm": 33.752967834472656,
456
  "learning_rate": 7.944444444444445e-06,
457
+ "loss": 1.1965,
458
  "step": 1425
459
  },
460
  {
461
+ "epoch": 0.8687837028160575,
462
+ "grad_norm": 20.359806060791016,
463
  "learning_rate": 7.88888888888889e-06,
464
+ "loss": 1.2509,
465
  "step": 1450
466
  },
467
  {
468
+ "epoch": 0.883762732174955,
469
+ "grad_norm": 25.176883697509766,
470
  "learning_rate": 7.833333333333333e-06,
471
+ "loss": 1.2518,
472
  "step": 1475
473
  },
474
  {
475
+ "epoch": 0.8987417615338527,
476
+ "grad_norm": 19.36232566833496,
477
  "learning_rate": 7.77777777777778e-06,
478
+ "loss": 1.1987,
479
  "step": 1500
480
  },
481
  {
482
+ "epoch": 0.8987417615338527,
483
+ "eval_cer": 66.07898236947108,
484
+ "eval_loss": 1.273741602897644,
485
+ "eval_runtime": 744.3637,
486
+ "eval_samples_per_second": 4.484,
487
+ "eval_steps_per_second": 0.562,
488
+ "eval_wer": 88.43978622944016,
489
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  }
491
  ],
492
  "logging_steps": 25,
 
494
  "num_input_tokens_seen": 0,
495
  "num_train_epochs": 3,
496
  "save_steps": 250,
497
+ "total_flos": 3.46302480384e+18,
498
  "train_batch_size": 8,
499
  "trial_name": null,
500
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1771d4d1fd9d29967100bf2ad3782b43296aace23affbca132bd14b0d7038c32
3
- size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4799acc98dfb5251f5598d709ba726baeb0f588a532deb4aa6e7f0d068fcb5f
3
+ size 5240