Shawn Tan
committed on
Commit
·
f5eaf34
1
Parent(s):
bf62d95
Cleanup config.
Browse files
- config.json +40 -80
config.json
CHANGED
@@ -308,8 +308,7 @@
|
|
308 |
"attention_multiplier": 0.0078125,
|
309 |
"dropout": 0,
|
310 |
"num_key_value_heads": 4,
|
311 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
312 |
-
|
313 |
},
|
314 |
{
|
315 |
"add_bias": false,
|
@@ -317,8 +316,7 @@
|
|
317 |
"attention_multiplier": 0.0078125,
|
318 |
"dropout": 0,
|
319 |
"num_key_value_heads": 4,
|
320 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
321 |
-
|
322 |
},
|
323 |
{
|
324 |
"add_bias": false,
|
@@ -326,8 +324,7 @@
|
|
326 |
"attention_multiplier": 0.0078125,
|
327 |
"dropout": 0,
|
328 |
"num_key_value_heads": 4,
|
329 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
330 |
-
|
331 |
},
|
332 |
{
|
333 |
"add_bias": false,
|
@@ -335,8 +332,7 @@
|
|
335 |
"attention_multiplier": 0.0078125,
|
336 |
"dropout": 0,
|
337 |
"num_key_value_heads": 4,
|
338 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
339 |
-
|
340 |
},
|
341 |
{
|
342 |
"add_bias": false,
|
@@ -344,8 +340,7 @@
|
|
344 |
"attention_multiplier": 0.0078125,
|
345 |
"dropout": 0,
|
346 |
"num_key_value_heads": 4,
|
347 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
348 |
-
|
349 |
},
|
350 |
{
|
351 |
"add_bias": false,
|
@@ -353,8 +348,7 @@
|
|
353 |
"attention_multiplier": 0.0078125,
|
354 |
"dropout": 0,
|
355 |
"num_key_value_heads": 4,
|
356 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
357 |
-
|
358 |
},
|
359 |
{
|
360 |
"add_bias": false,
|
@@ -362,8 +356,7 @@
|
|
362 |
"attention_multiplier": 0.0078125,
|
363 |
"dropout": 0,
|
364 |
"num_key_value_heads": 4,
|
365 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
366 |
-
|
367 |
},
|
368 |
{
|
369 |
"add_bias": false,
|
@@ -371,8 +364,7 @@
|
|
371 |
"attention_multiplier": 0.0078125,
|
372 |
"dropout": 0,
|
373 |
"num_key_value_heads": 4,
|
374 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
375 |
-
|
376 |
},
|
377 |
{
|
378 |
"add_bias": false,
|
@@ -380,8 +372,7 @@
|
|
380 |
"attention_multiplier": 0.0078125,
|
381 |
"dropout": 0,
|
382 |
"num_key_value_heads": 4,
|
383 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
384 |
-
|
385 |
},
|
386 |
{
|
387 |
"add_bias": false,
|
@@ -389,8 +380,7 @@
|
|
389 |
"attention_multiplier": 0.0078125,
|
390 |
"dropout": 0,
|
391 |
"num_key_value_heads": 4,
|
392 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
393 |
-
|
394 |
},
|
395 |
{
|
396 |
"add_bias": false,
|
@@ -398,8 +388,7 @@
|
|
398 |
"attention_multiplier": 0.0078125,
|
399 |
"dropout": 0,
|
400 |
"num_key_value_heads": 4,
|
401 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
402 |
-
|
403 |
},
|
404 |
{
|
405 |
"add_bias": false,
|
@@ -407,8 +396,7 @@
|
|
407 |
"attention_multiplier": 0.0078125,
|
408 |
"dropout": 0,
|
409 |
"num_key_value_heads": 4,
|
410 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
411 |
-
|
412 |
},
|
413 |
{
|
414 |
"add_bias": false,
|
@@ -416,8 +404,7 @@
|
|
416 |
"attention_multiplier": 0.0078125,
|
417 |
"dropout": 0,
|
418 |
"num_key_value_heads": 4,
|
419 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
420 |
-
|
421 |
},
|
422 |
{
|
423 |
"add_bias": false,
|
@@ -425,8 +412,7 @@
|
|
425 |
"attention_multiplier": 0.0078125,
|
426 |
"dropout": 0,
|
427 |
"num_key_value_heads": 4,
|
428 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
429 |
-
|
430 |
},
|
431 |
{
|
432 |
"add_bias": false,
|
@@ -434,8 +420,7 @@
|
|
434 |
"attention_multiplier": 0.0078125,
|
435 |
"dropout": 0,
|
436 |
"num_key_value_heads": 4,
|
437 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
438 |
-
|
439 |
},
|
440 |
{
|
441 |
"add_bias": false,
|
@@ -443,8 +428,7 @@
|
|
443 |
"attention_multiplier": 0.0078125,
|
444 |
"dropout": 0,
|
445 |
"num_key_value_heads": 4,
|
446 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
447 |
-
|
448 |
},
|
449 |
{
|
450 |
"add_bias": false,
|
@@ -452,8 +436,7 @@
|
|
452 |
"attention_multiplier": 0.0078125,
|
453 |
"dropout": 0,
|
454 |
"num_key_value_heads": 4,
|
455 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
456 |
-
|
457 |
},
|
458 |
{
|
459 |
"add_bias": false,
|
@@ -461,8 +444,7 @@
|
|
461 |
"attention_multiplier": 0.0078125,
|
462 |
"dropout": 0,
|
463 |
"num_key_value_heads": 4,
|
464 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
465 |
-
|
466 |
},
|
467 |
{
|
468 |
"add_bias": false,
|
@@ -470,8 +452,7 @@
|
|
470 |
"attention_multiplier": 0.0078125,
|
471 |
"dropout": 0,
|
472 |
"num_key_value_heads": 4,
|
473 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
474 |
-
|
475 |
},
|
476 |
{
|
477 |
"add_bias": false,
|
@@ -479,8 +460,7 @@
|
|
479 |
"attention_multiplier": 0.0078125,
|
480 |
"dropout": 0,
|
481 |
"num_key_value_heads": 4,
|
482 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
483 |
-
|
484 |
},
|
485 |
{
|
486 |
"add_bias": false,
|
@@ -488,8 +468,7 @@
|
|
488 |
"attention_multiplier": 0.0078125,
|
489 |
"dropout": 0,
|
490 |
"num_key_value_heads": 4,
|
491 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
492 |
-
|
493 |
},
|
494 |
{
|
495 |
"add_bias": false,
|
@@ -497,8 +476,7 @@
|
|
497 |
"attention_multiplier": 0.0078125,
|
498 |
"dropout": 0,
|
499 |
"num_key_value_heads": 4,
|
500 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
501 |
-
|
502 |
},
|
503 |
{
|
504 |
"add_bias": false,
|
@@ -506,8 +484,7 @@
|
|
506 |
"attention_multiplier": 0.0078125,
|
507 |
"dropout": 0,
|
508 |
"num_key_value_heads": 4,
|
509 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
510 |
-
|
511 |
},
|
512 |
{
|
513 |
"add_bias": false,
|
@@ -515,8 +492,7 @@
|
|
515 |
"attention_multiplier": 0.0078125,
|
516 |
"dropout": 0,
|
517 |
"num_key_value_heads": 4,
|
518 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
519 |
-
|
520 |
},
|
521 |
{
|
522 |
"add_bias": false,
|
@@ -524,8 +500,7 @@
|
|
524 |
"attention_multiplier": 0.0078125,
|
525 |
"dropout": 0,
|
526 |
"num_key_value_heads": 4,
|
527 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
528 |
-
|
529 |
},
|
530 |
{
|
531 |
"add_bias": false,
|
@@ -533,8 +508,7 @@
|
|
533 |
"attention_multiplier": 0.0078125,
|
534 |
"dropout": 0,
|
535 |
"num_key_value_heads": 4,
|
536 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
537 |
-
|
538 |
},
|
539 |
{
|
540 |
"add_bias": false,
|
@@ -542,8 +516,7 @@
|
|
542 |
"attention_multiplier": 0.0078125,
|
543 |
"dropout": 0,
|
544 |
"num_key_value_heads": 4,
|
545 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
546 |
-
|
547 |
},
|
548 |
{
|
549 |
"add_bias": false,
|
@@ -551,8 +524,7 @@
|
|
551 |
"attention_multiplier": 0.0078125,
|
552 |
"dropout": 0,
|
553 |
"num_key_value_heads": 4,
|
554 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
555 |
-
|
556 |
},
|
557 |
{
|
558 |
"add_bias": false,
|
@@ -560,8 +532,7 @@
|
|
560 |
"attention_multiplier": 0.0078125,
|
561 |
"dropout": 0,
|
562 |
"num_key_value_heads": 4,
|
563 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
564 |
-
|
565 |
},
|
566 |
{
|
567 |
"add_bias": false,
|
@@ -569,8 +540,7 @@
|
|
569 |
"attention_multiplier": 0.0078125,
|
570 |
"dropout": 0,
|
571 |
"num_key_value_heads": 4,
|
572 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
573 |
-
|
574 |
},
|
575 |
{
|
576 |
"add_bias": false,
|
@@ -578,8 +548,7 @@
|
|
578 |
"attention_multiplier": 0.0078125,
|
579 |
"dropout": 0,
|
580 |
"num_key_value_heads": 4,
|
581 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
582 |
-
|
583 |
},
|
584 |
{
|
585 |
"add_bias": false,
|
@@ -587,8 +556,7 @@
|
|
587 |
"attention_multiplier": 0.0078125,
|
588 |
"dropout": 0,
|
589 |
"num_key_value_heads": 4,
|
590 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
591 |
-
|
592 |
},
|
593 |
{
|
594 |
"add_bias": false,
|
@@ -596,8 +564,7 @@
|
|
596 |
"attention_multiplier": 0.0078125,
|
597 |
"dropout": 0,
|
598 |
"num_key_value_heads": 4,
|
599 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
600 |
-
|
601 |
},
|
602 |
{
|
603 |
"add_bias": false,
|
@@ -605,8 +572,7 @@
|
|
605 |
"attention_multiplier": 0.0078125,
|
606 |
"dropout": 0,
|
607 |
"num_key_value_heads": 4,
|
608 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
609 |
-
|
610 |
},
|
611 |
{
|
612 |
"add_bias": false,
|
@@ -614,8 +580,7 @@
|
|
614 |
"attention_multiplier": 0.0078125,
|
615 |
"dropout": 0,
|
616 |
"num_key_value_heads": 4,
|
617 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
618 |
-
|
619 |
},
|
620 |
{
|
621 |
"add_bias": false,
|
@@ -623,8 +588,7 @@
|
|
623 |
"attention_multiplier": 0.0078125,
|
624 |
"dropout": 0,
|
625 |
"num_key_value_heads": 4,
|
626 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
627 |
-
|
628 |
},
|
629 |
{
|
630 |
"add_bias": false,
|
@@ -632,8 +596,7 @@
|
|
632 |
"attention_multiplier": 0.0078125,
|
633 |
"dropout": 0,
|
634 |
"num_key_value_heads": 4,
|
635 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
636 |
-
|
637 |
},
|
638 |
{
|
639 |
"add_bias": false,
|
@@ -641,8 +604,7 @@
|
|
641 |
"attention_multiplier": 0.0078125,
|
642 |
"dropout": 0,
|
643 |
"num_key_value_heads": 4,
|
644 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
645 |
-
|
646 |
},
|
647 |
{
|
648 |
"add_bias": false,
|
@@ -650,8 +612,7 @@
|
|
650 |
"attention_multiplier": 0.0078125,
|
651 |
"dropout": 0,
|
652 |
"num_key_value_heads": 4,
|
653 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
654 |
-
|
655 |
},
|
656 |
{
|
657 |
"add_bias": false,
|
@@ -659,8 +620,7 @@
|
|
659 |
"attention_multiplier": 0.0078125,
|
660 |
"dropout": 0,
|
661 |
"num_key_value_heads": 4,
|
662 |
-
"sequence_mixer_type": "stickbreaking_attention"
|
663 |
-
|
664 |
}
|
665 |
],
|
666 |
"transformers_version": "4.49.0.dev0",
|
|
|
308 |
"attention_multiplier": 0.0078125,
|
309 |
"dropout": 0,
|
310 |
"num_key_value_heads": 4,
|
311 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
312 |
},
|
313 |
{
|
314 |
"add_bias": false,
|
|
|
316 |
"attention_multiplier": 0.0078125,
|
317 |
"dropout": 0,
|
318 |
"num_key_value_heads": 4,
|
319 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
320 |
},
|
321 |
{
|
322 |
"add_bias": false,
|
|
|
324 |
"attention_multiplier": 0.0078125,
|
325 |
"dropout": 0,
|
326 |
"num_key_value_heads": 4,
|
327 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
328 |
},
|
329 |
{
|
330 |
"add_bias": false,
|
|
|
332 |
"attention_multiplier": 0.0078125,
|
333 |
"dropout": 0,
|
334 |
"num_key_value_heads": 4,
|
335 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
336 |
},
|
337 |
{
|
338 |
"add_bias": false,
|
|
|
340 |
"attention_multiplier": 0.0078125,
|
341 |
"dropout": 0,
|
342 |
"num_key_value_heads": 4,
|
343 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
344 |
},
|
345 |
{
|
346 |
"add_bias": false,
|
|
|
348 |
"attention_multiplier": 0.0078125,
|
349 |
"dropout": 0,
|
350 |
"num_key_value_heads": 4,
|
351 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
352 |
},
|
353 |
{
|
354 |
"add_bias": false,
|
|
|
356 |
"attention_multiplier": 0.0078125,
|
357 |
"dropout": 0,
|
358 |
"num_key_value_heads": 4,
|
359 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
360 |
},
|
361 |
{
|
362 |
"add_bias": false,
|
|
|
364 |
"attention_multiplier": 0.0078125,
|
365 |
"dropout": 0,
|
366 |
"num_key_value_heads": 4,
|
367 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
368 |
},
|
369 |
{
|
370 |
"add_bias": false,
|
|
|
372 |
"attention_multiplier": 0.0078125,
|
373 |
"dropout": 0,
|
374 |
"num_key_value_heads": 4,
|
375 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
376 |
},
|
377 |
{
|
378 |
"add_bias": false,
|
|
|
380 |
"attention_multiplier": 0.0078125,
|
381 |
"dropout": 0,
|
382 |
"num_key_value_heads": 4,
|
383 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
384 |
},
|
385 |
{
|
386 |
"add_bias": false,
|
|
|
388 |
"attention_multiplier": 0.0078125,
|
389 |
"dropout": 0,
|
390 |
"num_key_value_heads": 4,
|
391 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
392 |
},
|
393 |
{
|
394 |
"add_bias": false,
|
|
|
396 |
"attention_multiplier": 0.0078125,
|
397 |
"dropout": 0,
|
398 |
"num_key_value_heads": 4,
|
399 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
400 |
},
|
401 |
{
|
402 |
"add_bias": false,
|
|
|
404 |
"attention_multiplier": 0.0078125,
|
405 |
"dropout": 0,
|
406 |
"num_key_value_heads": 4,
|
407 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
408 |
},
|
409 |
{
|
410 |
"add_bias": false,
|
|
|
412 |
"attention_multiplier": 0.0078125,
|
413 |
"dropout": 0,
|
414 |
"num_key_value_heads": 4,
|
415 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
416 |
},
|
417 |
{
|
418 |
"add_bias": false,
|
|
|
420 |
"attention_multiplier": 0.0078125,
|
421 |
"dropout": 0,
|
422 |
"num_key_value_heads": 4,
|
423 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
424 |
},
|
425 |
{
|
426 |
"add_bias": false,
|
|
|
428 |
"attention_multiplier": 0.0078125,
|
429 |
"dropout": 0,
|
430 |
"num_key_value_heads": 4,
|
431 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
432 |
},
|
433 |
{
|
434 |
"add_bias": false,
|
|
|
436 |
"attention_multiplier": 0.0078125,
|
437 |
"dropout": 0,
|
438 |
"num_key_value_heads": 4,
|
439 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
440 |
},
|
441 |
{
|
442 |
"add_bias": false,
|
|
|
444 |
"attention_multiplier": 0.0078125,
|
445 |
"dropout": 0,
|
446 |
"num_key_value_heads": 4,
|
447 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
448 |
},
|
449 |
{
|
450 |
"add_bias": false,
|
|
|
452 |
"attention_multiplier": 0.0078125,
|
453 |
"dropout": 0,
|
454 |
"num_key_value_heads": 4,
|
455 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
456 |
},
|
457 |
{
|
458 |
"add_bias": false,
|
|
|
460 |
"attention_multiplier": 0.0078125,
|
461 |
"dropout": 0,
|
462 |
"num_key_value_heads": 4,
|
463 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
464 |
},
|
465 |
{
|
466 |
"add_bias": false,
|
|
|
468 |
"attention_multiplier": 0.0078125,
|
469 |
"dropout": 0,
|
470 |
"num_key_value_heads": 4,
|
471 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
472 |
},
|
473 |
{
|
474 |
"add_bias": false,
|
|
|
476 |
"attention_multiplier": 0.0078125,
|
477 |
"dropout": 0,
|
478 |
"num_key_value_heads": 4,
|
479 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
480 |
},
|
481 |
{
|
482 |
"add_bias": false,
|
|
|
484 |
"attention_multiplier": 0.0078125,
|
485 |
"dropout": 0,
|
486 |
"num_key_value_heads": 4,
|
487 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
488 |
},
|
489 |
{
|
490 |
"add_bias": false,
|
|
|
492 |
"attention_multiplier": 0.0078125,
|
493 |
"dropout": 0,
|
494 |
"num_key_value_heads": 4,
|
495 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
496 |
},
|
497 |
{
|
498 |
"add_bias": false,
|
|
|
500 |
"attention_multiplier": 0.0078125,
|
501 |
"dropout": 0,
|
502 |
"num_key_value_heads": 4,
|
503 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
504 |
},
|
505 |
{
|
506 |
"add_bias": false,
|
|
|
508 |
"attention_multiplier": 0.0078125,
|
509 |
"dropout": 0,
|
510 |
"num_key_value_heads": 4,
|
511 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
512 |
},
|
513 |
{
|
514 |
"add_bias": false,
|
|
|
516 |
"attention_multiplier": 0.0078125,
|
517 |
"dropout": 0,
|
518 |
"num_key_value_heads": 4,
|
519 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
520 |
},
|
521 |
{
|
522 |
"add_bias": false,
|
|
|
524 |
"attention_multiplier": 0.0078125,
|
525 |
"dropout": 0,
|
526 |
"num_key_value_heads": 4,
|
527 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
528 |
},
|
529 |
{
|
530 |
"add_bias": false,
|
|
|
532 |
"attention_multiplier": 0.0078125,
|
533 |
"dropout": 0,
|
534 |
"num_key_value_heads": 4,
|
535 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
536 |
},
|
537 |
{
|
538 |
"add_bias": false,
|
|
|
540 |
"attention_multiplier": 0.0078125,
|
541 |
"dropout": 0,
|
542 |
"num_key_value_heads": 4,
|
543 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
544 |
},
|
545 |
{
|
546 |
"add_bias": false,
|
|
|
548 |
"attention_multiplier": 0.0078125,
|
549 |
"dropout": 0,
|
550 |
"num_key_value_heads": 4,
|
551 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
552 |
},
|
553 |
{
|
554 |
"add_bias": false,
|
|
|
556 |
"attention_multiplier": 0.0078125,
|
557 |
"dropout": 0,
|
558 |
"num_key_value_heads": 4,
|
559 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
560 |
},
|
561 |
{
|
562 |
"add_bias": false,
|
|
|
564 |
"attention_multiplier": 0.0078125,
|
565 |
"dropout": 0,
|
566 |
"num_key_value_heads": 4,
|
567 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
568 |
},
|
569 |
{
|
570 |
"add_bias": false,
|
|
|
572 |
"attention_multiplier": 0.0078125,
|
573 |
"dropout": 0,
|
574 |
"num_key_value_heads": 4,
|
575 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
576 |
},
|
577 |
{
|
578 |
"add_bias": false,
|
|
|
580 |
"attention_multiplier": 0.0078125,
|
581 |
"dropout": 0,
|
582 |
"num_key_value_heads": 4,
|
583 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
584 |
},
|
585 |
{
|
586 |
"add_bias": false,
|
|
|
588 |
"attention_multiplier": 0.0078125,
|
589 |
"dropout": 0,
|
590 |
"num_key_value_heads": 4,
|
591 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
592 |
},
|
593 |
{
|
594 |
"add_bias": false,
|
|
|
596 |
"attention_multiplier": 0.0078125,
|
597 |
"dropout": 0,
|
598 |
"num_key_value_heads": 4,
|
599 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
600 |
},
|
601 |
{
|
602 |
"add_bias": false,
|
|
|
604 |
"attention_multiplier": 0.0078125,
|
605 |
"dropout": 0,
|
606 |
"num_key_value_heads": 4,
|
607 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
608 |
},
|
609 |
{
|
610 |
"add_bias": false,
|
|
|
612 |
"attention_multiplier": 0.0078125,
|
613 |
"dropout": 0,
|
614 |
"num_key_value_heads": 4,
|
615 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
616 |
},
|
617 |
{
|
618 |
"add_bias": false,
|
|
|
620 |
"attention_multiplier": 0.0078125,
|
621 |
"dropout": 0,
|
622 |
"num_key_value_heads": 4,
|
623 |
+
"sequence_mixer_type": "stickbreaking_attention"
|
|
|
624 |
}
|
625 |
],
|
626 |
"transformers_version": "4.49.0.dev0",
|