Commit f5feb4c
Parent(s): 279cf39

Upload 5 files

The trained base model (2k batch size, 125k training steps) of the Multi-perspective Course Learner (Pre-training Language Model as a Multi-perspective Course Learner, Findings of ACL 2023).
- checkpoint_1_125000.pt +3 -0
- dict.txt +0 -0
- get_json_file.py +9 -0
- sentencepiece.bpe.model +3 -0
- shard_data.py +22 -0
    	
        checkpoint_1_125000.pt
    ADDED
    
    @@ -0,0 +1,3 @@
    +version https://git-lfs.github.com/spec/v1
    +oid sha256:704e1dfc96819022f6b52b0a43f0dade574d8f49dcdb756ed014b86407eca204
    +size 2395347755
    	
        dict.txt
    ADDED
    
    The diff for this file is too large to render. See raw diff.
    	
        get_json_file.py
    ADDED
    
    @@ -0,0 +1,9 @@
    +import glob
    +import json
    +from sys import argv
    +
    +for split in ['train', 'valid']:
    +    with open(f'json/{split}.json', 'w') as f:
    +        data = [{'source': glob.glob(f'shard/{split}/*'), 'source_lang': 'en', 'weight': 1.0, 'name': '16gb-en'}]
    +        json.dump(data, f, indent=4)
    +
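
    For context, get_json_file.py builds one manifest per split by globbing shard/<split>/ and writing the result with json.dump(..., indent=4). A sketch of the train manifest it would produce, assuming two shard files exist for that split (the shard file names below are hypothetical):

    [
        {
            "source": [
                "shard/train/000000.txt",
                "shard/train/000001.txt"
            ],
            "source_lang": "en",
            "weight": 1.0,
            "name": "16gb-en"
        }
    ]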
    	
        sentencepiece.bpe.model
    ADDED
    
    @@ -0,0 +1,3 @@
    +version https://git-lfs.github.com/spec/v1
    +oid sha256:3a60b4d1d1d8f70c8b2569c94540d4d9b7c694fd32e7a428ad0dcffaafaa3beb
    +size 1363614
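
    sentencepiece.bpe.model is a standard SentencePiece model file; a minimal sketch of loading it with the sentencepiece Python package (the sample text is arbitrary, not taken from this repo):

    import sentencepiece as spm

    # Load the BPE model shipped in this commit and tokenize a sample sentence.
    sp = spm.SentencePieceProcessor(model_file='sentencepiece.bpe.model')
    pieces = sp.encode('a sample sentence', out_type=str)  # subword pieces
    ids = sp.encode('a sample sentence', out_type=int)     # vocabulary ids
    print(pieces, ids)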
    	
        shard_data.py
    ADDED
    
    @@ -0,0 +1,22 @@
    +from sys import argv
    +
    +filename = argv[1]
    +num_line = argv[2]
    +output_dir = argv[3]
    +
    +lines = open(filename).read().strip().split('\n')
    +
    +ckpt = 0
    +shard_lines = []
    +
    +for i, line in enumerate(lines):
    +    if line == '' and (i-ckpt)>=int(num_line):
    +        shard_lines.append(lines[ckpt:i+1])
    +        ckpt = i+1
    +
    +if ckpt < len(lines) - 1:
    +    shard_lines.append(lines[ckpt:])
    +
    +for i, doc in enumerate(shard_lines):
    +    with open(f'{output_dir}/{i:06}.txt', 'w') as f:
    +        print('\n'.join(doc), file=f, end='\n')
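
    Taken together, the two scripts sketch a small preprocessing pipeline: shard_data.py splits a blank-line-delimited corpus into numbered shard files, and get_json_file.py then lists those shards in per-split JSON manifests. A hedged end-to-end sketch, assuming corpus files named corpus.train.txt / corpus.valid.txt and a shard size of 10000 lines (both values are assumptions, not taken from the commit):

    import os
    import subprocess

    # Both scripts expect their output directories to already exist.
    for split in ['train', 'valid']:
        os.makedirs(f'shard/{split}', exist_ok=True)
    os.makedirs('json', exist_ok=True)

    # argv order for shard_data.py: input file, lines per shard, output directory.
    subprocess.run(['python', 'shard_data.py', 'corpus.train.txt', '10000', 'shard/train'], check=True)
    subprocess.run(['python', 'shard_data.py', 'corpus.valid.txt', '10000', 'shard/valid'], check=True)

    # Write json/train.json and json/valid.json listing every shard per split.
    subprocess.run(['python', 'get_json_file.py'], check=True)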