diff --git "a/train.ipynb" "b/train.ipynb" new file mode 100644--- /dev/null +++ "b/train.ipynb" @@ -0,0 +1,1110 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 36, + "id": "452ee8e2-7d0f-4a52-97e6-7976bb21e4c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.9.2 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorboard 2.9.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q -U transformers huggingface_hub datasets accelerate peft onnx onnxruntime optimum\n", + "# !pip install -q transformers==4.28.0 " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7cabae93-a56e-48e6-bf6e-36f65b15700b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected operating system as Ubuntu/focal.\n", + "Checking for curl...\n", + "Detected curl...\n", + "Checking for gpg...\n", + "Detected gpg...\n", + "Detected apt version as 2.0.9\n", + "Running apt-get update... done.\n", + "Installing apt-transport-https... done.\n", + "Installing /etc/apt/sources.list.d/github_git-lfs.list...done.\n", + "Importing packagecloud gpg key... Packagecloud gpg key imported to /etc/apt/keyrings/github_git-lfs-archive-keyring.gpg\n", + "done.\n", + "Running apt-get update... done.\n", + "\n", + "The repository is setup! You can now install packages.\n" + ] + } + ], + "source": [ + "!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6428ceeb-9fd1-455d-9cb7-f4d4102d6d34", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following NEW packages will be installed:\n", + " git-lfs\n", + "0 upgraded, 1 newly installed, 0 to remove and 111 not upgraded.\n", + "Need to get 7419 kB of archives.\n", + "After this operation, 16.0 MB of additional disk space will be used.\n", + "Get:1 https://packagecloud.io/github/git-lfs/ubuntu focal/main amd64 git-lfs amd64 3.3.0 [7419 kB]\n", + "Fetched 7419 kB in 1s (11.9 MB/s)\n", + "Selecting previously unselected package git-lfs.\n", + "(Reading database ... 69943 files and directories currently installed.)\n", + "Preparing to unpack .../git-lfs_3.3.0_amd64.deb ...\n", + "Unpacking git-lfs (3.3.0) ...\n", + "Setting up git-lfs (3.3.0) ...\n", + "Git LFS initialized.\n", + "Processing triggers for man-db (2.9.1-1) ...\n" + ] + } + ], + "source": [ + "!sudo apt-get install git-lfs" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "d522266b-71f2-4e92-a187-dfb8e497bd0d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a509e74171a046dea452a1c3281c17d2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
Step | \n", + "Training Loss | \n", + "Validation Loss | \n", + "
---|---|---|
1550 | \n", + "0.128300 | \n", + "0.602472 | \n", + "
1600 | \n", + "0.125100 | \n", + "0.629285 | \n", + "
1650 | \n", + "0.115500 | \n", + "0.609703 | \n", + "
1700 | \n", + "0.123100 | \n", + "0.627464 | \n", + "
1750 | \n", + "0.097100 | \n", + "0.610019 | \n", + "
1800 | \n", + "0.104800 | \n", + "0.604064 | \n", + "
1850 | \n", + "0.087500 | \n", + "0.612238 | \n", + "
1900 | \n", + "0.104200 | \n", + "0.613567 | \n", + "
1950 | \n", + "0.097600 | \n", + "0.612580 | \n", + "
2000 | \n", + "0.069200 | \n", + "0.610577 | \n", + "
"
+ ],
+ "text/plain": [
+ "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+ "│ /usr/local/lib/python3.9/dist-packages/huggingface_hub/repository.py:1032 in git_commit │\n",
+ "│ │\n",
+ "│ 1029 │ │ │ │ The message attributed to the commit. │\n",
+ "│ 1030 │ │ \"\"\" │\n",
+ "│ 1031 │ │ try: │\n",
+ "│ ❱ 1032 │ │ │ result = run_subprocess(\"git commit -v -m\".split() + [commit_message], self. │\n",
+ "│ 1033 │ │ │ logger.info(f\"Committed:\\n{result.stdout}\\n\") │\n",
+ "│ 1034 │ │ except subprocess.CalledProcessError as exc: │\n",
+ "│ 1035 │ │ │ if len(exc.stderr) > 0: │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/_subprocess.py:83 in run_subprocess │\n",
+ "│ │\n",
+ "│ 80 │ if isinstance(folder, Path): │\n",
+ "│ 81 │ │ folder = str(folder) │\n",
+ "│ 82 │ │\n",
+ "│ ❱ 83 │ return subprocess.run( │\n",
+ "│ 84 │ │ command, │\n",
+ "│ 85 │ │ stderr=subprocess.PIPE, │\n",
+ "│ 86 │ │ stdout=subprocess.PIPE, │\n",
+ "│ │\n",
+ "│ /usr/lib/python3.9/subprocess.py:528 in run │\n",
+ "│ │\n",
+ "│ 525 │ │ │ raise │\n",
+ "│ 526 │ │ retcode = process.poll() │\n",
+ "│ 527 │ │ if check and retcode: │\n",
+ "│ ❱ 528 │ │ │ raise CalledProcessError(retcode, process.args, │\n",
+ "│ 529 │ │ │ │ │ │ │ │ │ output=stdout, stderr=stderr) │\n",
+ "│ 530 │ return CompletedProcess(process.args, retcode, stdout, stderr) │\n",
+ "│ 531 │\n",
+ "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+ "CalledProcessError: Command '['git', 'commit', '-v', '-m', 'Training in progress, step 2000']' returned non-zero \n",
+ "exit status 128.\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+ "│ in <module> │\n",
+ "│ │\n",
+ "│ ❱ 1 trainer.train(\"ingbetic/checkpoint-1500\") │\n",
+ "│ 2 │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:1664 in train │\n",
+ "│ │\n",
+ "│ 1661 │ │ inner_training_loop = find_executable_batch_size( │\n",
+ "│ 1662 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │\n",
+ "│ 1663 │ │ ) │\n",
+ "│ ❱ 1664 │ │ return inner_training_loop( │\n",
+ "│ 1665 │ │ │ args=args, │\n",
+ "│ 1666 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │\n",
+ "│ 1667 │ │ │ trial=trial, │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:2019 in _inner_training_loop │\n",
+ "│ │\n",
+ "│ 2016 │ │ │ │ │ self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epo │\n",
+ "│ 2017 │ │ │ │ │ self.control = self.callback_handler.on_step_end(args, self.state, s │\n",
+ "│ 2018 │ │ │ │ │ │\n",
+ "│ ❱ 2019 │ │ │ │ │ self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_k │\n",
+ "│ 2020 │ │ │ │ else: │\n",
+ "│ 2021 │ │ │ │ │ self.control = self.callback_handler.on_substep_end(args, self.state │\n",
+ "│ 2022 │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:2308 in _maybe_log_save_evaluate │\n",
+ "│ │\n",
+ "│ 2305 │ │ │ │ self.lr_scheduler.step(metrics[self.args.metric_for_best_model]) │\n",
+ "│ 2306 │ │ │\n",
+ "│ 2307 │ │ if self.control.should_save: │\n",
+ "│ ❱ 2308 │ │ │ self._save_checkpoint(model, trial, metrics=metrics) │\n",
+ "│ 2309 │ │ │ self.control = self.callback_handler.on_save(self.args, self.state, self.con │\n",
+ "│ 2310 │ │\n",
+ "│ 2311 │ def _load_rng_state(self, checkpoint): │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:2462 in _save_checkpoint │\n",
+ "│ │\n",
+ "│ 2459 │ │ │ torch.save(rng_states, os.path.join(output_dir, f\"rng_state_{self.args.proce │\n",
+ "│ 2460 │ │ │\n",
+ "│ 2461 │ │ if self.args.push_to_hub: │\n",
+ "│ ❱ 2462 │ │ │ self._push_from_checkpoint(output_dir) │\n",
+ "│ 2463 │ │ │\n",
+ "│ 2464 │ │ # Maybe delete some older checkpoints. │\n",
+ "│ 2465 │ │ if self.args.should_save: │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:3649 in _push_from_checkpoint │\n",
+ "│ │\n",
+ "│ 3646 │ │ │ │ commit_message = f\"Training in progress, step {self.state.global_step}\" │\n",
+ "│ 3647 │ │ │ else: │\n",
+ "│ 3648 │ │ │ │ commit_message = f\"Training in progress, epoch {int(self.state.epoch)}\" │\n",
+ "│ ❱ 3649 │ │ │ _, self.push_in_progress = self.repo.push_to_hub( │\n",
+ "│ 3650 │ │ │ │ commit_message=commit_message, blocking=False, auto_lfs_prune=True │\n",
+ "│ 3651 │ │ │ ) │\n",
+ "│ 3652 │ │ finally: │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/huggingface_hub/repository.py:1306 in push_to_hub │\n",
+ "│ │\n",
+ "│ 1303 │ │ │ logger.info(\"Repo currently clean. Ignoring push_to_hub\") │\n",
+ "│ 1304 │ │ │ return None │\n",
+ "│ 1305 │ │ self.git_add(auto_lfs_track=True) │\n",
+ "│ ❱ 1306 │ │ self.git_commit(commit_message) │\n",
+ "│ 1307 │ │ return self.git_push( │\n",
+ "│ 1308 │ │ │ upstream=f\"origin {self.current_branch}\", │\n",
+ "│ 1309 │ │ │ blocking=blocking, │\n",
+ "│ │\n",
+ "│ /usr/local/lib/python3.9/dist-packages/huggingface_hub/repository.py:1036 in git_commit │\n",
+ "│ │\n",
+ "│ 1033 │ │ │ logger.info(f\"Committed:\\n{result.stdout}\\n\") │\n",
+ "│ 1034 │ │ except subprocess.CalledProcessError as exc: │\n",
+ "│ 1035 │ │ │ if len(exc.stderr) > 0: │\n",
+ "│ ❱ 1036 │ │ │ │ raise EnvironmentError(exc.stderr) │\n",
+ "│ 1037 │ │ │ else: │\n",
+ "│ 1038 │ │ │ │ raise EnvironmentError(exc.stdout) │\n",
+ "│ 1039 │\n",
+ "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+ "OSError: fatal: Unable to create '/notebooks/ingbetic/.git/index.lock': File exists.\n",
+ "\n",
+ "Another git process seems to be running in this repository, e.g.\n",
+ "an editor opened by 'git commit'. Please make sure all processes\n",
+ "are terminated then try again. If it still fails, a git process\n",
+ "may have crashed in this repository earlier:\n",
+ "remove the file manually to continue.\n",
+ "\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/\u001b[0m\u001b[1;33mrepository.py\u001b[0m:\u001b[94m1032\u001b[0m in \u001b[92mgit_commit\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1029 \u001b[0m\u001b[2;33m│ │ │ │ \u001b[0m\u001b[33mThe message attributed to the commit.\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1030 \u001b[0m\u001b[2;33m│ │ \u001b[0m\u001b[33m\"\"\"\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1031 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1032 \u001b[2m│ │ │ \u001b[0mresult = run_subprocess(\u001b[33m\"\u001b[0m\u001b[33mgit commit -v -m\u001b[0m\u001b[33m\"\u001b[0m.split() + [commit_message], \u001b[96mself\u001b[0m. \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1033 \u001b[0m\u001b[2m│ │ │ \u001b[0mlogger.info(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mCommitted:\u001b[0m\u001b[33m\\n\u001b[0m\u001b[33m{\u001b[0mresult.stdout\u001b[33m}\u001b[0m\u001b[33m\\n\u001b[0m\u001b[33m\"\u001b[0m) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1034 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mexcept\u001b[0m subprocess.CalledProcessError \u001b[94mas\u001b[0m exc: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1035 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mlen\u001b[0m(exc.stderr) > \u001b[94m0\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/\u001b[0m\u001b[1;33m_subprocess.py\u001b[0m:\u001b[94m83\u001b[0m in \u001b[92mrun_subprocess\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 80 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mif\u001b[0m \u001b[96misinstance\u001b[0m(folder, Path): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 81 \u001b[0m\u001b[2m│ │ \u001b[0mfolder = \u001b[96mstr\u001b[0m(folder) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 82 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 83 \u001b[2m│ \u001b[0m\u001b[94mreturn\u001b[0m subprocess.run( \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 84 \u001b[0m\u001b[2m│ │ \u001b[0mcommand, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 85 \u001b[0m\u001b[2m│ │ \u001b[0mstderr=subprocess.PIPE, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 86 \u001b[0m\u001b[2m│ │ \u001b[0mstdout=subprocess.PIPE, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/usr/lib/python3.9/\u001b[0m\u001b[1;33msubprocess.py\u001b[0m:\u001b[94m528\u001b[0m in \u001b[92mrun\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 525 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 526 \u001b[0m\u001b[2m│ │ \u001b[0mretcode = process.poll() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 527 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m check \u001b[95mand\u001b[0m retcode: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 528 \u001b[2m│ │ │ \u001b[0m\u001b[94mraise\u001b[0m CalledProcessError(retcode, process.args, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 529 \u001b[0m\u001b[2m│ │ │ │ │ │ │ │ │ \u001b[0moutput=stdout, stderr=stderr) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 530 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mreturn\u001b[0m CompletedProcess(process.args, retcode, stdout, stderr) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 531 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
+ "\u001b[1;91mCalledProcessError: \u001b[0mCommand \u001b[32m'\u001b[0m\u001b[32m[\u001b[0m\u001b[32m'\u001b[0mgit', \u001b[32m'commit'\u001b[0m, \u001b[32m'-v'\u001b[0m, \u001b[32m'-m'\u001b[0m, \u001b[32m'Training in progress, step 2000'\u001b[0m\u001b[1m]\u001b[0m' returned non-zero \n",
+ "exit status \u001b[1;36m128\u001b[0m.\n",
+ "\n",
+ "\u001b[3mDuring handling of the above exception, another exception occurred:\u001b[0m\n",
+ "\n",
+ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+ "\u001b[31m│\u001b[0m in \u001b[92m