Spaces:
Sleeping
Sleeping
Zai
committed on
Commit
·
bd78e12
1
Parent(s):
6feb6a4
Reset project for cleaner approach
Browse files
- .github/workflows/hugging-face.yaml +0 -42
- .idea/.gitignore +8 -0
- .idea/burmese-gpt.iml +8 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- README.md +1 -2
- burmese_gpt/__init__.py +0 -0
- burmese_gpt/config.py +20 -0
- burmese_gpt/model.py +7 -0
- burmesegpt/__init__.py +0 -1
- burmesegpt/config.py +0 -9
- burmesegpt/core.py +0 -46
- burmesegpt/data_prep.py +0 -19
- burmesegpt/models.py +0 -24
- burmesegpt/tokenizer.py +0 -18
- burmesegpt/utils.py +0 -7
- sample.py +0 -10
- setup.py +5 -0
- tests/test_dataloader.py +0 -9
- tests/test_tokenizer.py +0 -15
- training.py +0 -7
.github/workflows/hugging-face.yaml
DELETED
@@ -1,42 +0,0 @@
|
|
1 |
-
name: Uploading on Huggingface
|
2 |
-
on:
|
3 |
-
push:
|
4 |
-
branches: [main]
|
5 |
-
workflow_dispatch:
|
6 |
-
|
7 |
-
jobs:
|
8 |
-
sync-to-hub:
|
9 |
-
runs-on: ubuntu-latest
|
10 |
-
steps:
|
11 |
-
- uses: actions/checkout@v3
|
12 |
-
with:
|
13 |
-
fetch-depth: 0
|
14 |
-
lfs: true
|
15 |
-
- name: Set Git identity
|
16 |
-
run: |
|
17 |
-
git config --global user.email "[email protected]"
|
18 |
-
git config --global user.name "GitHub Actions"
|
19 |
-
|
20 |
-
- name: Update README.md
|
21 |
-
run: |
|
22 |
-
tmp_file=$(mktemp)
|
23 |
-
echo "---" >> $tmp_file
|
24 |
-
echo "title: Burmese GPT" >> $tmp_file
|
25 |
-
echo "emoji: 💫️" >> $tmp_file
|
26 |
-
echo "colorFrom: yellow" >> $tmp_file
|
27 |
-
echo "colorTo: blue" >> $tmp_file
|
28 |
-
echo "sdk: streamlit" >> $tmp_file
|
29 |
-
echo "sdk_version: 1.29.0" >> $tmp_file
|
30 |
-
echo "app_file: space.py" >> $tmp_file
|
31 |
-
echo "pinned: false" >> $tmp_file
|
32 |
-
echo "license: openrail" >> $tmp_file
|
33 |
-
echo "---" >> $tmp_file
|
34 |
-
echo "" >> $tmp_file
|
35 |
-
cat README.md >> $tmp_file
|
36 |
-
mv $tmp_file README.md
|
37 |
-
git add README.md
|
38 |
-
git commit -m "Updated README.md"
|
39 |
-
- name: Push to hub
|
40 |
-
env:
|
41 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
42 |
-
run: git push https://zaibutcooler:[email protected]/spaces/zaibutcooler/burmese-gpt --force main
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
4 |
+
# Editor-based HTTP Client requests
|
5 |
+
/httpRequests/
|
6 |
+
# Datasource local storage ignored files
|
7 |
+
/dataSources/
|
8 |
+
/dataSources.local.xml
|
.idea/burmese-gpt.iml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="jdk" jdkName="/opt/anaconda3" jdkType="Python SDK" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
</module>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="Black">
|
4 |
+
<option name="sdkName" value="/opt/anaconda3" />
|
5 |
+
</component>
|
6 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="/opt/anaconda3" project-jdk-type="Python SDK" />
|
7 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/burmese-gpt.iml" filepath="$PROJECT_DIR$/.idea/burmese-gpt.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
README.md
CHANGED
@@ -1,2 +1 @@
|
|
1 |
-
|
2 |
-
# Burmese GPT
|
|
|
1 |
+
# Burmese GPT V2
|
|
burmese_gpt/__init__.py
ADDED
File without changes
|
burmese_gpt/config.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
|
3 |
+
@dataclass
|
4 |
+
class ModelConfig:
|
5 |
+
vocab_size: int = 30000
|
6 |
+
embed_dim: int = 256
|
7 |
+
num_heads: int = 8
|
8 |
+
num_layers: int = 6
|
9 |
+
dropout: float = 0.1
|
10 |
+
max_seq_len: int = 512
|
11 |
+
|
12 |
+
@dataclass
|
13 |
+
class TrainingConfig:
|
14 |
+
batch_size: int = 32
|
15 |
+
learning_rate: float = 5e-5
|
16 |
+
num_epochs: int = 10
|
17 |
+
warmup_steps: int = 1000
|
18 |
+
weight_decay: float = 0.01
|
19 |
+
checkpoint_dir: str = "checkpoints"
|
20 |
+
log_dir: str = "logs"
|
burmese_gpt/model.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
class BurmeseGPT(nn.Module):
|
5 |
+
def __init__(self):
|
6 |
+
super(BurmeseGPT, self).__init__()
|
7 |
+
pass
|
burmesegpt/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
from .core import BurmeseGpt
|
|
|
|
burmesegpt/config.py
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
|
3 |
-
class Config:
|
4 |
-
def __init__(self) -> None:
|
5 |
-
pass
|
6 |
-
|
7 |
-
config = Config()
|
8 |
-
|
9 |
-
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
burmesegpt/core.py
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
from huggingface_hub import login
|
3 |
-
|
4 |
-
from .models import SelfAttention,MLP,GPT
|
5 |
-
from .tokenizer import Tokenizer
|
6 |
-
from .config import Config
|
7 |
-
from .data_prep import Data
|
8 |
-
|
9 |
-
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
10 |
-
|
11 |
-
tokenizer = Tokenizer()
|
12 |
-
|
13 |
-
class BurmeseGpt:
|
14 |
-
def __init__(self,tk=tokenizer):
|
15 |
-
self.attention = SelfAttention().to(device)
|
16 |
-
self.mlp = MLP().to(device)
|
17 |
-
self.config = Config().to(device)
|
18 |
-
self.train_data = Data().to(device)
|
19 |
-
self.tokenizer = tk
|
20 |
-
|
21 |
-
def save_pretrained(self, name="burmese-gpt"):
|
22 |
-
print("Uploading model...")
|
23 |
-
self.model.save_pretrained(name)
|
24 |
-
print(f"Model saved locally as '{name}'")
|
25 |
-
self.model.push_to_hub(name)
|
26 |
-
print(f"Model '{name}' uploaded to the Hugging Face Model Hub")
|
27 |
-
|
28 |
-
def load_pretrained(self, model_id="zaibutcooler/burmese-gpt"):
|
29 |
-
print("Loading model...")
|
30 |
-
model = model.from_pretrained(model_id)
|
31 |
-
print(f"Model '{model_id}' loaded successfully")
|
32 |
-
return model
|
33 |
-
|
34 |
-
def huggingface_login(self,token):
|
35 |
-
login(token)
|
36 |
-
|
37 |
-
def pretrain(self):
|
38 |
-
pass
|
39 |
-
|
40 |
-
def fine_tune(self):
|
41 |
-
pass
|
42 |
-
|
43 |
-
def generate(self,entry=''):
|
44 |
-
result = None
|
45 |
-
|
46 |
-
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
burmesegpt/data_prep.py
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
# to preps data
|
2 |
-
from datasets import load_dataset
|
3 |
-
from torch.utils.data import Dataset
|
4 |
-
|
5 |
-
|
6 |
-
class Data(Dataset):
|
7 |
-
def __init__(self):
|
8 |
-
super().__init__()
|
9 |
-
self.texts = None
|
10 |
-
self.data
|
11 |
-
|
12 |
-
def __len__(self):
|
13 |
-
return None
|
14 |
-
|
15 |
-
def __getitem__(self, index):
|
16 |
-
return self.trainset[index]
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
burmesegpt/models.py
DELETED
@@ -1,24 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
from torch import nn
|
3 |
-
from huggingface_hub import PyTorchModelHubMixin
|
4 |
-
|
5 |
-
class SelfAttention(nn.Module,PyTorchModelHubMixin):
|
6 |
-
def __init__(self):
|
7 |
-
pass
|
8 |
-
|
9 |
-
def forward(self):
|
10 |
-
pass
|
11 |
-
|
12 |
-
class MLP(nn.Module,PyTorchModelHubMixin):
|
13 |
-
def __init__(self):
|
14 |
-
pass
|
15 |
-
|
16 |
-
def forward(self):
|
17 |
-
pass
|
18 |
-
|
19 |
-
class GPT(nn.Module,PyTorchModelHubMixin):
|
20 |
-
def __init__(self):
|
21 |
-
super().__init__()
|
22 |
-
|
23 |
-
def forward(self):
|
24 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
burmesegpt/tokenizer.py
DELETED
@@ -1,18 +0,0 @@
|
|
1 |
-
class Tokenizer:
|
2 |
-
def __init__(self):
|
3 |
-
pass
|
4 |
-
|
5 |
-
def train(self,text,num_worker,verbose=False):
|
6 |
-
pass
|
7 |
-
|
8 |
-
def encoder(self,text):
|
9 |
-
pass
|
10 |
-
|
11 |
-
def decoder(self,ids):
|
12 |
-
pass
|
13 |
-
|
14 |
-
def save(self):
|
15 |
-
pass
|
16 |
-
|
17 |
-
def load(self):
|
18 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
burmesegpt/utils.py
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
# utils
|
2 |
-
|
3 |
-
def get_stats():
|
4 |
-
pass
|
5 |
-
|
6 |
-
def merge():
|
7 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sample.py
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
# sample the texts
|
2 |
-
from burmesegpt import BurmeseGpt
|
3 |
-
|
4 |
-
gpt = BurmeseGpt()
|
5 |
-
|
6 |
-
model_name = ''
|
7 |
-
|
8 |
-
gpt.load_pretrained(model_name)
|
9 |
-
|
10 |
-
gpt.sample()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
setup.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup
|
2 |
+
|
3 |
+
setup(
|
4 |
+
name="burmese_gpt",
|
5 |
+
)
|
tests/test_dataloader.py
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
from ..burmesegpt import Data
|
2 |
-
|
3 |
-
|
4 |
-
def check_loader():
|
5 |
-
pass
|
6 |
-
|
7 |
-
|
8 |
-
def check_output_dim():
|
9 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_tokenizer.py
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
import unittest
|
2 |
-
import torch
|
3 |
-
|
4 |
-
|
5 |
-
class TestTokenizer(unittest.TestCase):
|
6 |
-
def test_downloading(self):
|
7 |
-
url = ""
|
8 |
-
pass
|
9 |
-
|
10 |
-
def test_loading(self):
|
11 |
-
pass
|
12 |
-
|
13 |
-
|
14 |
-
if __name__ == "__main__":
|
15 |
-
unittest.main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
training.py
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
from burmesegpt import BurmeseGpt
|
2 |
-
|
3 |
-
gpt = BurmeseGpt()
|
4 |
-
|
5 |
-
gpt.train()
|
6 |
-
|
7 |
-
out_dir = 'out'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|