mkshing committed
Commit a66edf1 · 0 Parent(s)

initial commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ test.py
LICENSE ADDED
@@ -0,0 +1,60 @@
+ JAPANESE STABLELM RESEARCH LICENSE AGREEMENT
+ Dated: August 7, 2023
+
+ "Agreement" means the terms and conditions for use, reproduction, distribution and modification of the Software Products set forth herein.
+
+ “Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software.
+
+ "Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person’s or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+
+ "Stability AI" or "we" means Stability AI Ltd.
+
+ "Software" means, collectively, Stability AI’s proprietary Japanese StableLM made available under this Agreement.
+
+ “Software Products” means Software and Documentation.
+
+ By using or distributing any portion or element of the Software Products, you agree to be bound by this Agreement.
+
+ 1. License Rights and Redistribution.
+    a. Subject to your compliance with this Agreement and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Software Products to reproduce, distribute, and create derivative works of the Software Products for purposes other than commercial or production use.
+    b. You will not, and will not permit, assist or cause any third party to use, modify, copy, reproduce, create derivative works of, or distribute the Software Products (or any derivative works thereof, works incorporating the Software Products, or any data produced by the Software), in whole or in part, for any commercial or production purposes.
+    c. If you distribute or make the Software Products, or any derivative works thereof, available to a third party, you shall (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "Japanese StableLM is licensed under the Japanese StableLM Research License, Copyright (c) Stability AI Ltd. All Rights Reserved.”
+    d. The licenses granted to you under this Agreement are conditioned upon your compliance with the Documentation and this Agreement, including the Acceptable Use Policy below and as may be updated from time to time in the future on stability.ai, which is hereby incorporated by reference into this Agreement.
+ 2. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS.
+ 3. Limitation of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+ 4. Intellectual Property.
+    a. No trademark licenses are granted under this Agreement, and in connection with the Software Products, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products.
+    b. Subject to Stability AI’s ownership of the Software Products and derivatives made by or for Stability AI, with respect to any derivative works and modifications of the Software Products that are made by you, as between you and Stability AI, you are and will be the owner of such derivative works and modifications.
+    c. If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products in violation of this Agreement.
+ 5. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Software Products. Sections 2-4 shall survive the termination of this Agreement.
+
+ ----------
+
+ Japanese StableLM Acceptable Use Policy
+
+ If you access, use, or distribute any Stability AI models, software, or other materials (“Stability Technology”) you agree to this Acceptable Use Policy (“Policy”).
+
+ We want everyone to use Stability Technology safely and responsibly. You agree you will not use, or allow others to use, Stability Technology to:
+ 1. To violate the law or others’ rights (including intellectual property rights and the rights of data privacy and protection), nor will you promote, contribute to, encourage, facilitate, plan, incite, or further anyone else’s violation of the law or others’ rights;
+ 2. To commit, promote, contribute to, facilitate, encourage, plan, incite, or further any of the following:
+    a. Violence or terrorism;
+    b. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content;
+    c. Human trafficking, exploitation, and sexual violence;
+    d. Harassment, abuse, threatening, stalking, or bullying of individuals or groups of individuals;
+    e. Discrimination in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services on the basis of race, color, caste, religion, sex (including pregnancy, sexual orientation, or gender identity), national origin, age, disability, or genetic information (including family medical history) except as may be required by applicable law (such as the provision of social security benefits solely to people who meet certain age requirements under the law);
+    f. Creation of malicious code, malware, computer viruses or any activity that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system;
+ 3. For purposes of or for the performance of:
+    a. Fully automated decision-making, including profiling, with respect to an individual or group of individuals which produces legal effects concerning such individual(s) or similarly significantly affects such individual(s);
+    b. Systematic or automated scraping, mining, extraction, or harvesting of personally identifiable data, or similar activity, from the output of any Stability Technology except with respect to data that you have provided as input to the Stability Technology and which you are legally entitled to process, for so long as you retain such entitlement;
+    c. Development, improvement, or manufacture of any weapons of mass destruction (such as nuclear, chemical, or biologic weapons), weapons of war (such as missiles or landmines), or any gain of function-related activities with respect to any pathogens;
+    d. Mission critical applications or systems where best industry practices require fail-safe controls or performance, including operation of nuclear facilities, aircraft navigation, electrical grids, communication systems, water treatment facilities, air traffic control, life support, weapons systems, or emergency locator or other emergency services;
+ 4. To intentionally deceive or mislead others, including use of Japanese StableLM related to the following:
+    i. Generating, promoting, or furthering fraud or the creation or promotion of disinformation;
+    ii. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content;
+    iii. Generating, promoting, or further distributing spam;
+    iv. Impersonating another individual without consent, authorization, or legal right
+    v. Representing or misleading people into believing that the use of Japanese StableLM or outputs are human-generated;
+    vi. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement;
+    vii. Generating or facilitating large-scale political advertisements, propaganda, or influence campaigns;
+ 5. Fail to appropriately disclose to end users any known dangers of your AI system or misrepresent or mislead with respect to its abilities.
+ Nothing in this AUP is intended to prevent or impede any good faith research, testing, or evaluation of Japanese StableLM, or publication related to any of the foregoing. If you discover any flaws in Japanese StableLM that may be harmful to people in any way, we encourage you to notify us and give us a chance to remedy such flaws before others can exploit them. If you have questions about this AUP, contact us at [email protected].
README.md ADDED
@@ -0,0 +1,192 @@
+ ---
+ language:
+ - ja
+ tags:
+ - instructblip
+ - vision
+ - image-captioning
+ - japanese-stablelm
+ pipeline_tag: image-to-text
+ license:
+ - other
+ extra_gated_heading: Access Japanese StableLM Instruct Alpha
+ extra_gated_description: This repository is publicly accessible, but you have to accept the conditions to access its files and content.
+ extra_gated_button_content: Access repository
+ extra_gated_fields:
+   Name: text
+   Email: text
+   Organization: text
+   I agree to accept the conditions and share above info with Stability AI: checkbox
+ extra_gated_prompt: |
+   ### JAPANESE STABLELM RESEARCH LICENSE AGREEMENT
+   Dated: August 7, 2023
+
+   "Agreement" means the terms and conditions for use, reproduction, distribution and modification of the Software Products set forth herein.
+
+   “Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software.
+
+   "Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person’s or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+
+   "Stability AI" or "we" means Stability AI Ltd.
+
+   "Software" means, collectively, Stability AI’s proprietary Japanese StableLM made available under this Agreement.
+
+   “Software Products” means Software and Documentation.
+
+   By using or distributing any portion or element of the Software Products, you agree to be bound by this Agreement.
+   - License Rights and Redistribution.
+     - Subject to your compliance with this Agreement and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Software Products to reproduce, distribute, and create derivative works of the Software Products for purposes other than commercial or production use.
+     - You will not, and will not permit, assist or cause any third party to use, modify, copy, reproduce, create derivative works of, or distribute the Software Products (or any derivative works thereof, works incorporating the Software Products, or any data produced by the Software), in whole or in part, for any commercial or production purposes.
+     - If you distribute or make the Software Products, or any derivative works thereof, available to a third party, you shall (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "Japanese StableLM is licensed under the Japanese StableLM Research License, Copyright (c) Stability AI Ltd. All Rights Reserved.”
+     - The licenses granted to you under this Agreement are conditioned upon your compliance with the Documentation and this Agreement, including the Acceptable Use Policy below and as may be updated from time to time in the future on stability.ai, which is hereby incorporated by reference into this Agreement.
+   - Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS.
+   - Limitation of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+   - Intellectual Property.
+     - No trademark licenses are granted under this Agreement, and in connection with the Software Products, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products.
+     - Subject to Stability AI’s ownership of the Software Products and derivatives made by or for Stability AI, with respect to any derivative works and modifications of the Software Products that are made by you, as between you and Stability AI, you are and will be the owner of such derivative works and modifications.
+     - If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products in violation of this Agreement.
+   - Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Software Products. Sections 2-4 shall survive the termination of this Agreement.
+   ----------
+   ### Japanese StableLM Acceptable Use Policy
+   If you access, use, or distribute any Stability AI models, software, or other materials (“Stability Technology”) you agree to this Acceptable Use Policy (“Policy”).
+   We want everyone to use Stability Technology safely and responsibly. You agree you will not use, or allow others to use, Stability Technology to:
+   - To violate the law or others’ rights (including intellectual property rights and the rights of data privacy and protection), nor will you promote, contribute to, encourage, facilitate, plan, incite, or further anyone else’s violation of the law or others’ rights;
+   - To commit, promote, contribute to, facilitate, encourage, plan, incite, or further any of the following:
+     - Violence or terrorism;
+     - Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content;
+     - Human trafficking, exploitation, and sexual violence;
+     - Harassment, abuse, threatening, stalking, or bullying of individuals or groups of individuals;
+     - Discrimination in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services on the basis of race, color, caste, religion, sex (including pregnancy, sexual orientation, or gender identity), national origin, age, disability, or genetic information (including family medical history) except as may be required by applicable law (such as the provision of social security benefits solely to people who meet certain age requirements under the law);
+     - Creation of malicious code, malware, computer viruses or any activity that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system;
+   - For purposes of or for the performance of:
+     - Fully automated decision-making, including profiling, with respect to an individual or group of individuals which produces legal effects concerning such individual(s) or similarly significantly affects such individual(s);
+     - Systematic or automated scraping, mining, extraction, or harvesting of personally identifiable data, or similar activity, from the output of any Stability Technology except with respect to data that you have provided as input to the Stability Technology and which you are legally entitled to process, for so long as you retain such entitlement;
+     - Development, improvement, or manufacture of any weapons of mass destruction (such as nuclear, chemical, or biologic weapons), weapons of war (such as missiles or landmines), or any gain of function-related activities with respect to any pathogens;
+     - Mission critical applications or systems where best industry practices require fail-safe controls or performance, including operation of nuclear facilities, aircraft navigation, electrical grids, communication systems, water treatment facilities, air traffic control, life support, weapons systems, or emergency locator or other emergency services;
+   - To intentionally deceive or mislead others, including use of Japanese StableLM related to the following:
+     - Generating, promoting, or furthering fraud or the creation or promotion of disinformation;
+     - Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content;
+     - Generating, promoting, or further distributing spam;
+     - Impersonating another individual without consent, authorization, or legal right
+     - Representing or misleading people into believing that the use of Japanese StableLM or outputs are human-generated;
+     - Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement;
+     - Generating or facilitating large-scale political advertisements, propaganda, or influence campaigns;
+   - Fail to appropriately disclose to end users any known dangers of your AI system or misrepresent or mislead with respect to its abilities.
+   Nothing in this AUP is intended to prevent or impede any good faith research, testing, or evaluation of Japanese StableLM, or publication related to any of the foregoing. If you discover any flaws in Japanese StableLM that may be harmful to people in any way, we encourage you to notify us and give us a chance to remedy such flaws before others can exploit them. If you have questions about this AUP, contact us at [email protected].
+ ---
+
+ # Japanese InstructBLIP Alpha
+
+ ![japanese-instructblip-icon](./japanese-instructblip-parrot.png)
+
+ ## Model Details
+ Japanese InstructBLIP Alpha is a vision-language instruction-following model that can generate Japanese descriptions for an input image and, optionally, additional text input such as a question.
+
+
+ ## Usage
+
+ First, install the additional dependencies listed in [requirements.txt](./requirements.txt):
+
+ ```sh
+ pip install sentencepiece einops
+ ```
+
+
+ ```python
+ import torch
+ from transformers import LlamaTokenizer, AutoModelForVision2Seq, BlipImageProcessor
+ from PIL import Image
+ import requests
+
+ # helper function to format input prompts
+ def build_prompt(prompt="", sep="\n\n### "):
+     sys_msg = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"
+     p = sys_msg
+     roles = ["指示", "応答"]
+     user_query = "与えられた画像について、詳細に述べてください。"
+     msgs = [": \n" + user_query, ": "]
+     if prompt:
+         roles.insert(1, "入力")
+         msgs.insert(1, ": \n" + prompt)
+     for role, msg in zip(roles, msgs):
+         p += sep + role + msg
+     return p
+
+ # load model
+ model = AutoModelForVision2Seq.from_pretrained("stabilityai/japanese-instructblip-alpha", trust_remote_code=True)
+ processor = BlipImageProcessor.from_pretrained("stabilityai/japanese-instructblip-alpha")
+ tokenizer = LlamaTokenizer.from_pretrained("novelai/nerdstash-tokenizer-v1", additional_special_tokens=['▁▁'])
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+
+ # prepare inputs
+ url = "https://images.unsplash.com/photo-1582538885592-e70a5d7ab3d3?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1770&q=80"
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+ prompt = ""  # input empty string for image captioning. You can also input questions as prompts
+ prompt = build_prompt(prompt)
+ inputs = processor(images=image, return_tensors="pt")
+ text_encoding = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
+ text_encoding["qformer_input_ids"] = text_encoding["input_ids"].clone()
+ text_encoding["qformer_attention_mask"] = text_encoding["attention_mask"].clone()
+ inputs.update(text_encoding)
+
+ # generate
+ outputs = model.generate(
+     **inputs.to(device, dtype=model.dtype),
+     num_beams=5,
+     max_new_tokens=32,
+     min_length=1,
+ )
+ generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+ print(generated_text)
+ # 桜と東京スカイツリー
+ ```
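
To ask a question about the image instead of producing a caption, pass the question string as `prompt`. A minimal sketch reusing the objects from the block above (the question text is only an illustrative example):

```python
# visual question answering: pass a question instead of the empty string
question = "空は何色ですか？"  # "What color is the sky?"
prompt = build_prompt(question)

inputs = processor(images=image, return_tensors="pt")
text_encoding = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
text_encoding["qformer_input_ids"] = text_encoding["input_ids"].clone()
text_encoding["qformer_attention_mask"] = text_encoding["attention_mask"].clone()
inputs.update(text_encoding)

outputs = model.generate(**inputs.to(device, dtype=model.dtype), num_beams=5, max_new_tokens=32)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip())
```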
+
+
+ ## Model Details
+ * **Developed by**: [Stability AI](https://stability.ai/)
+ * **Model type**: [InstructBLIP](https://arxiv.org/abs/2305.06500)
+ * **Language(s)**: Japanese
+ * **License**: [JAPANESE STABLELM RESEARCH LICENSE AGREEMENT](./LICENSE)
+
+ ### Training
+ Japanese InstructBLIP Alpha leverages the [InstructBLIP](https://arxiv.org/abs/2305.06500) architecture. It consists of three components: a frozen vision image encoder, a Q-Former, and a frozen LLM. The vision encoder and the Q-Former were initialized with [Salesforce/instructblip-vicuna-7b](https://huggingface.co/Salesforce/instructblip-vicuna-7b). For the frozen LLM, the [Japanese-StableLM-Instruct-Alpha-7B](https://huggingface.co/stabilityai/japanese-stablelm-instruct-alpha-7b) model was used. During training, only the Q-Former was trained, as sketched below.
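
A rough sketch of that setup (not the actual training script, which is not included in this commit; attribute names follow `modeling_japanese_instructblip_alpha.py` below, and the learning rate is a placeholder):

```python
import torch

# freeze the vision encoder and the LLM; only the Q-Former pathway stays trainable
for param in model.vision_model.parameters():
    param.requires_grad = False
for param in model.language_model.parameters():
    param.requires_grad = False

trainable_params = [model.query_tokens]                           # learned query tokens
trainable_params += list(model.qformer.parameters())              # Q-Former
trainable_params += list(model.language_projection.parameters())  # projection into the LLM
optimizer = torch.optim.AdamW(trainable_params, lr=1e-5)          # placeholder hyperparameter
```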
+
+ ### Training Dataset
+ The training dataset includes the following public datasets:
+ - [CC12M](https://github.com/google-research-datasets/conceptual-12m) with captions translated into Japanese
+ - [MS-COCO](https://cocodataset.org/#home) with [STAIR Captions](http://captions.stair.center/)
+ - [Japanese Visual Genome VQA dataset](https://github.com/yahoojapan/ja-vg-vqa)
+
+ ## Use and Limitations
+
+ ### Intended Use
+
+ This model is intended to be used by the open-source community in chat-like applications, in accordance with the research license.
+
+ ### Limitations and bias
+
+ Although the aforementioned datasets help to steer the base language models into "safer" distributions of text, not all biases and toxicity can be mitigated through fine-tuning. We ask that users be mindful of potential issues that can arise in generated responses. Do not treat model outputs as substitutes for human judgment or as sources of truth. Please use responsibly.
+
+
+ ## How to cite
+ ```bibtex
+ @misc{JapaneseInstructBLIPAlpha,
+     url = {https://huggingface.co/stabilityai/japanese-instructblip-alpha},
+     title = {Japanese InstructBLIP Alpha},
+     author = {Shing, Makoto and Akiba, Takuya}
+ }
+ ```
+
+ ## Citations
+
+ ```bibtex
+ @misc{dai2023instructblip,
+     title = {InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning},
+     author = {Wenliang Dai and Junnan Li and Dongxu Li and Anthony Meng Huat Tiong and Junqi Zhao and Weisheng Wang and Boyang Li and Pascale Fung and Steven Hoi},
+     year = {2023},
+     eprint = {2305.06500},
+     archivePrefix = {arXiv},
+     primaryClass = {cs.CV}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,255 @@
+ {
+   "_name_or_path": "stabilityai/japanese-instructblip-alpha",
+   "architectures": [
+     "JapaneseInstructBlipAlphaForConditionalGeneration"
+   ],
+   "auto_map": {
+     "AutoModelForVision2Seq": "modeling_japanese_instructblip_alpha.JapaneseInstructBlipAlphaForConditionalGeneration",
+     "AutoConfig": "configuration_japanese_instructblip_alpha.JapaneseInstructBlipAlphaConfig"
+   },
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "model_type": "instructblip",
+   "num_query_tokens": 32,
+   "qformer_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_probs_dropout_prob": 0.1,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_frequency": 2,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_hidden_size": 1408,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-12,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 512,
+     "min_length": 0,
+     "model_type": "instructblip_qformer",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 0,
+     "position_embedding_type": "absolute",
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.31.0",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vocab_size": 65535
+   },
+   "text_config": {
+     "_name_or_path": "stabilityai/japanese-stablelm-instruct-alpha-7b",
+     "add_cross_attention": false,
+     "architectures": [
+       "JapaneseStableLMAlphaForCausalLM"
+     ],
+     "auto_map": {
+       "AutoConfig": "stabilityai/japanese-stablelm-instruct-alpha-7b--configuration_japanese_stablelm_alpha.JapaneseStableLMAlphaConfig",
+       "AutoModelForCausalLM": "stabilityai/japanese-stablelm-instruct-alpha-7b--modeling_japanese_stablelm_alpha.JapaneseStableLMAlphaForCausalLM"
+     },
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 3,
+     "chunk_size_feed_forward": 0,
+     "classifier_dropout": 0.1,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 3,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 2048,
+     "min_length": 0,
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 32,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 32,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rotary_emb_base": 10000,
+     "rotary_pct": 0.25,
+     "rotary_scale_base": 512,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": false,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "float32",
+     "torchscript": false,
+     "transformers_version": "4.31.0",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_bias_in_mlp": false,
+     "use_cache": true,
+     "use_parallel_residual": true,
+     "vocab_size": 65535
+   },
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "use_decoder_only_language_model": true,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_size": 1408,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_range": 1e-10,
+     "intermediate_size": 6144,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-06,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "instructblip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 39,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "qkv_bias": true,
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.31.0",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   }
+ }
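
The `auto_map` block above is what lets the `trust_remote_code=True` loading path shown in the README resolve the custom classes shipped in this repository. For instance, loading just the config (a small sketch; the printed class name is shown as a comment):

```python
from transformers import AutoConfig

# trust_remote_code=True allows transformers to import the classes named in auto_map
config = AutoConfig.from_pretrained("stabilityai/japanese-instructblip-alpha", trust_remote_code=True)
print(type(config).__name__)  # JapaneseInstructBlipAlphaConfig
```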
configuration_japanese_instructblip_alpha.py ADDED
@@ -0,0 +1,57 @@
+ # coding=utf-8
+ # Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Japanese InstructBLIP Alpha model configuration"""
+
+ from transformers import (
+     PretrainedConfig,
+     InstructBlipConfig,
+     InstructBlipVisionConfig,
+     InstructBlipQFormerConfig,
+     AutoConfig,
+ )
+ from transformers.utils import logging
+ from .configuration_japanese_stablelm_alpha import JapaneseStableLMAlphaConfig
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class JapaneseInstructBlipAlphaConfig(InstructBlipConfig):
+     def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
+         PretrainedConfig.__init__(self, **kwargs)
+
+         if vision_config is None:
+             vision_config = {}
+             logger.info("vision_config is None. Initializing the InstructBlipVisionConfig with default values.")
+
+         if qformer_config is None:
+             qformer_config = {}
+             logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.")
+
+         if text_config is None:
+             text_config = {}
+             logger.info("text_config is None. Initializing the text config with default values (`JapaneseStableLMAlphaConfig`).")
+         self.vision_config = InstructBlipVisionConfig(**vision_config)
+         self.qformer_config = InstructBlipQFormerConfig(**qformer_config)
+         self.text_config = JapaneseStableLMAlphaConfig(**text_config)
+
+         self.tie_word_embeddings = self.text_config.tie_word_embeddings
+         self.is_encoder_decoder = self.text_config.is_encoder_decoder
+
+         self.num_query_tokens = num_query_tokens
+         # the Q-Former cross-attends to the vision encoder, so its encoder width must match
+         self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
+         self.use_decoder_only_language_model = True
+         self.initializer_factor = 1.0
+         self.initializer_range = 0.02
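
Since the composite config above simply wraps three sub-configs, it can also be built directly from plain dicts. A small usage sketch, assuming the repository files are importable as a package (the values shown mirror config.json):

```python
from configuration_japanese_instructblip_alpha import JapaneseInstructBlipAlphaConfig

config = JapaneseInstructBlipAlphaConfig(
    vision_config={"hidden_size": 1408, "num_hidden_layers": 39},
    qformer_config={"hidden_size": 768, "num_hidden_layers": 12},
    text_config={"hidden_size": 4096, "num_hidden_layers": 32},
    num_query_tokens=32,
)
# the constructor ties the Q-Former's cross-attention width to the vision encoder
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size
```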
configuration_japanese_stablelm_alpha.py ADDED
@@ -0,0 +1,120 @@
+ # coding=utf-8
+ # Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ JapaneseStableLMAlpha model configuration"""
+
+ from transformers import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ STABLE_LM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class JapaneseStableLMAlphaConfig(PretrainedConfig):
+     r"""
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 65536):
+             Vocabulary size of the JapaneseStableLMAlphaModel. Defines the number of different tokens that
+             can be represented by the `inputs_ids` passed when calling [`JapaneseStableLMAlphaModel`].
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the decoder layers and the pooler layer.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         intermediate_size (`int`, *optional*, defaults to 16384):
+             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string).
+         rotary_pct (`float`, *optional*, defaults to 0.25):
+             Percentage of hidden dimensions to allocate to rotary embeddings.
+         rotary_emb_base (`int`, *optional*, defaults to 10000):
+             Base for computing rotary embeddings frequency.
+         rotary_scale_base (`int`, *optional*, defaults to 512):
+             Base `scale` for computing XPos rotary embeddings scale.
+         classifier_dropout (`float`, *optional*, defaults to 0.1):
+             Argument used when doing token classification, used in the model
+             [`StableLMForTokenClassification`]. The dropout ratio for the hidden layer.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with.
+             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing
+             all weight matrices.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+             The epsilon used by the layer normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions
+             (not used by all models). Only relevant if `config.is_decoder=True`.
+         use_parallel_residual (`bool`, *optional*, defaults to `True`):
+             Whether to use a "parallel" formulation in each Transformer layer,
+             which can provide a slight training speedup at large scales.
+     Example:
+
+     ```python
+     >>> from transformers import JapaneseStableLMAlphaConfig, JapaneseStableLMAlphaModel
+
+     >>> # Initializing a JapaneseStableLMAlpha style configuration
+     >>> configuration = JapaneseStableLMAlphaConfig()
+
+     >>> # Initializing a model (with random weights) from the style configuration
+     >>> model = JapaneseStableLMAlphaModel(configuration)  # doctest: +SKIP
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config  # doctest: +SKIP
+     ```"""
+     def __init__(
+         self,
+         vocab_size=65536,
+         hidden_size=4096,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         hidden_act="silu",
+         rotary_pct=0.25,
+         rotary_emb_base=10000,
+         rotary_scale_base=512,
+         classifier_dropout=0.1,
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         layer_norm_eps=1e-5,
+         use_cache=True,
+         bos_token_id=3,
+         eos_token_id=3,
+         tie_word_embeddings=False,
+         use_parallel_residual=True,
+         use_bias_in_mlp=True,
+         **kwargs,
+     ):
+         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.hidden_act = hidden_act
+         self.rotary_pct = rotary_pct
+         self.rotary_emb_base = rotary_emb_base
+         self.rotary_scale_base = rotary_scale_base
+         self.classifier_dropout = classifier_dropout
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.use_cache = use_cache
+         self.tie_word_embeddings = tie_word_embeddings
+         self.use_parallel_residual = use_parallel_residual
+         self.use_bias_in_mlp = use_bias_in_mlp
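
To make `rotary_pct` concrete: only a fraction of each attention head's dimensions receives rotary (XPos-scaled) position embeddings. With the defaults above, the arithmetic works out as follows (a sketch of the computation, not code from this file):

```python
hidden_size = 4096
num_attention_heads = 32
rotary_pct = 0.25

head_size = hidden_size // num_attention_heads  # 128 dimensions per attention head
rotary_ndims = int(head_size * rotary_pct)      # 32 of them carry rotary position information
print(head_size, rotary_ndims)                  # 128 32
```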
japanese-instructblip-parrot.png ADDED
modeling_japanese_instructblip_alpha.py ADDED
@@ -0,0 +1,62 @@
+ # coding=utf-8
+ # Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ PyTorch Japanese InstructBLIP Alpha model. """
+ import torch
+ from torch import nn
+ from transformers import (
+     InstructBlipPreTrainedModel,
+     InstructBlipVisionModel,
+     InstructBlipQFormerModel,
+     InstructBlipForConditionalGeneration,
+     AutoModelForCausalLM,
+     AutoModelForSeq2SeqLM,
+ )
+ from transformers.utils import logging
+ from .modeling_japanese_stablelm_alpha import JapaneseStableLMAlphaForCausalLM
+ from .configuration_japanese_instructblip_alpha import JapaneseInstructBlipAlphaConfig
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class JapaneseInstructBlipAlphaForConditionalGeneration(InstructBlipForConditionalGeneration):
+     config_class = JapaneseInstructBlipAlphaConfig
+
+     def __init__(self, config: JapaneseInstructBlipAlphaConfig):
+         InstructBlipPreTrainedModel.__init__(self, config)
+
+         # frozen image encoder
+         self.vision_model = InstructBlipVisionModel(config.vision_config)
+
+         # learned query tokens through which the Q-Former reads out image features
+         self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
+         self.qformer = InstructBlipQFormerModel(config.qformer_config)
+
+         # projects Q-Former outputs into the LLM's embedding space
+         self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
+
+         if config.use_decoder_only_language_model:
+             language_model = JapaneseStableLMAlphaForCausalLM(config.text_config)
+         else:
+             # encoder-decoder language models are not supported by this wrapper
+             raise NotImplementedError
+
+         if language_model._no_split_modules is not None:
+             self._no_split_modules.extend(language_model._no_split_modules)
+
+         if language_model._keep_in_fp32_modules is not None:
+             self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)
+
+         self.language_model = language_model
+
+         # Initialize weights and apply final processing
+         self.post_init()
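
For orientation, the inherited `InstructBlipForConditionalGeneration` generation path wires these components together roughly as below. This is a simplified sketch of the upstream flow (attention masks and generation arguments omitted), not the exact library code:

```python
import torch

def generate_from_image(model, pixel_values, input_ids, qformer_input_ids):
    """Simplified InstructBLIP data flow; attribute names match the constructor above."""
    # 1. encode the image with the frozen vision encoder
    image_embeds = model.vision_model(pixel_values).last_hidden_state
    # 2. learned query tokens cross-attend to the image features inside the Q-Former
    query_tokens = model.query_tokens.expand(image_embeds.shape[0], -1, -1)
    query_output = model.qformer(
        input_ids=qformer_input_ids,
        query_embeds=query_tokens,
        encoder_hidden_states=image_embeds,
    ).last_hidden_state[:, : query_tokens.shape[1]]
    # 3. project the query outputs into the LLM embedding space, prepend to the text embeddings
    visual_prefix = model.language_projection(query_output)
    text_embeds = model.language_model.get_input_embeddings()(input_ids)
    inputs_embeds = torch.cat([visual_prefix, text_embeds], dim=1)
    # 4. the frozen LLM generates conditioned on the visual prefix
    return model.language_model.generate(inputs_embeds=inputs_embeds)
```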
modeling_japanese_stablelm_alpha.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch JapaneseStableLMAlpha model. """
16
+ from typing import Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.utils.checkpoint
20
+ from torch import nn
21
+ from torch.nn import CrossEntropyLoss
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithPast,
24
+ CausalLMOutputWithPast,
25
+ )
26
+ from transformers.modeling_utils import PreTrainedModel
27
+ from transformers.utils import logging
28
+ from .configuration_japanese_stablelm_alpha import JapaneseStableLMAlphaConfig
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ class JapaneseStableLMAlphaPreTrainedModel(PreTrainedModel):
35
+ """
36
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
37
+ models.
38
+ """
39
+
40
+ config_class = JapaneseStableLMAlphaConfig
41
+ base_model_prefix = "transformer"
42
+ supports_gradient_checkpointing = True
43
+ _no_split_modules = ["DecoderLayer"]
44
+ _skip_keys_device_placement = "past_key_values"
45
+
46
+ def _init_weights(self, module):
47
+ """Initialize the weights"""
48
+ if isinstance(module, nn.Linear):
49
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
50
+ if module.bias is not None:
51
+ module.bias.data.zero_()
52
+ elif isinstance(module, nn.Embedding):
53
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
54
+ if module.padding_idx is not None:
55
+ module.weight.data[module.padding_idx].zero_()
56
+ elif isinstance(module, nn.LayerNorm):
57
+ if module.bias is not None:
58
+ module.bias.data.zero_()
59
+ if module.weight is not None:
60
+ module.weight.data.fill_(1.0)
61
+
62
+ def _set_gradient_checkpointing(self, module, value=False):
63
+ if isinstance(module, JapaneseStableLMAlphaModel):
64
+ module.gradient_checkpointing = value
65
+
66
+
67
+ class JapaneseStableLMAlphaModel(JapaneseStableLMAlphaPreTrainedModel):
68
+ def __init__(self, config):
69
+ super().__init__(config)
70
+ self.config = config
71
+
72
+ self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
73
+ self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)])
74
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
75
+
76
+ self.gradient_checkpointing = False
77
+
78
+ # Initialize weights and apply final processing
79
+ self.post_init()
80
+
81
+ def get_input_embeddings(self):
82
+ return self.embed_in
83
+
84
+ def set_input_embeddings(self, value):
85
+ self.embed_in = value
86
+
87
+ def forward(
88
+ self,
89
+ input_ids: Optional[torch.LongTensor] = None,
90
+ attention_mask: Optional[torch.FloatTensor] = None,
91
+ position_ids: Optional[torch.LongTensor] = None,
92
+ head_mask: Optional[torch.FloatTensor] = None,
93
+ inputs_embeds: Optional[torch.FloatTensor] = None,
94
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
95
+ use_cache: Optional[bool] = None,
96
+ output_attentions: Optional[bool] = None,
97
+ output_hidden_states: Optional[bool] = None,
98
+ return_dict: Optional[bool] = None,
99
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
100
+ r"""
101
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
102
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
103
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
104
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
105
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
106
+ use_cache (`bool`, *optional*):
107
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
108
+ `past_key_values`).
109
+ """
110
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
111
+ output_hidden_states = (
112
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
113
+ )
114
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
115
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
116
+
117
+ if input_ids is not None and inputs_embeds is not None:
118
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
119
+ elif input_ids is not None:
120
+ input_shape = input_ids.size()
121
+ elif inputs_embeds is not None:
122
+ input_shape = inputs_embeds.size()[:-1]
123
+ else:
124
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
125
+
126
+ batch_size, seq_length = input_shape
127
+
128
+ if past_key_values is None:
129
+ past_length = 0
130
+ past_key_values = tuple([None] * self.config.num_hidden_layers)
131
+ else:
132
+ past_length = past_key_values[0][0].size(-2)
133
+
134
+ if position_ids is None:
135
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
136
+ position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
137
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
138
+ else:
139
+ position_ids = position_ids.view(-1, seq_length).long()
140
+
141
+ # Attention mask.
142
+ if attention_mask is not None:
143
+ assert batch_size > 0, "batch_size has to be defined and > 0"
144
+ attention_mask = attention_mask.view(batch_size, -1)
145
+ # We create a 3D attention mask from a 2D tensor mask.
146
+ # Sizes are [batch_size, 1, 1, to_seq_length]
147
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
148
+ # this attention mask is more simple than the triangular masking of causal attention
149
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
150
+ attention_mask = attention_mask[:, None, None, :]
151
+
152
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
153
+ # masked positions, this operation will create a tensor which is 0.0 for
154
+ # positions we want to attend and the dtype's smallest value for masked positions.
155
+ # Since we are adding it to the raw scores before the softmax, this is
156
+ # effectively the same as removing these entirely.
157
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
158
+ attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
159
+
160
+ # Prepare head mask if needed
161
+ # 1.0 in head_mask indicates we keep the head
162
+ # attention_probs has shape bsz x n_heads x N x N
163
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
164
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
165
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
166
+
167
+ if inputs_embeds is None:
168
+ inputs_embeds = self.embed_in(input_ids)
169
+
170
+ hidden_states = inputs_embeds
171
+
172
+ if self.gradient_checkpointing and self.training:
173
+ if use_cache:
174
+ logger.warning(
175
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
176
+ )
177
+ use_cache = False
178
+
179
+ presents = () if use_cache else None
180
+ all_attentions = () if output_attentions else None
181
+ all_hidden_states = () if output_hidden_states else None
182
+ for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)):
183
+ if output_hidden_states:
184
+ all_hidden_states = all_hidden_states + (hidden_states,)
185
+
186
+ if self.gradient_checkpointing and self.training:
187
+
188
+ def create_custom_forward(module):
189
+ def custom_forward(*inputs):
190
+ # None for layer_past
191
+ return module(*inputs, use_cache, None, output_attentions)
192
+
193
+ return custom_forward
194
+
195
+ outputs = torch.utils.checkpoint.checkpoint(
196
+ create_custom_forward(layer),
197
+ hidden_states,
198
+ attention_mask,
199
+ position_ids,
200
+ head_mask[i],
201
+ )
202
+ else:
203
+ outputs = layer(
204
+ hidden_states,
205
+ attention_mask=attention_mask,
206
+ position_ids=position_ids,
207
+ head_mask=head_mask[i],
208
+ layer_past=layer_past,
209
+ use_cache=use_cache,
210
+ output_attentions=output_attentions,
211
+ )
212
+ hidden_states = outputs[0]
213
+ if use_cache:
214
+ presents = presents + (outputs[1],)
215
+ if output_attentions:
216
+ all_attentions = all_attentions + (outputs[2 if use_cache else 1],)
217
+
218
+ hidden_states = self.final_layer_norm(hidden_states)
219
+ # Add last hidden state
220
+ if output_hidden_states:
221
+ all_hidden_states = all_hidden_states + (hidden_states,)
222
+
223
+ if not return_dict:
224
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
225
+
226
+ return BaseModelOutputWithPast(
227
+ last_hidden_state=hidden_states,
228
+ past_key_values=presents,
229
+ hidden_states=all_hidden_states,
230
+ attentions=all_attentions,
231
+ )
232
+
233
+
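The `past_key_values` contract in the docstring above is easiest to see as a two-step decode; a minimal sketch, assuming `model` is a loaded `JapaneseStableLMAlphaForCausalLM` and `input_ids` an already-encoded prompt (both placeholders here):

```python
# prefill: run the full prompt once and keep the per-layer (key, value) cache
outputs = model(input_ids, use_cache=True)
past = outputs.past_key_values  # tuple of per-layer (key, value) pairs

# decode step: feed only the newly chosen token, shape (batch_size, 1),
# instead of re-running the whole sequence
next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
outputs = model(next_token, past_key_values=past, use_cache=True)
```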
234
+ class DecoderLayer(nn.Module):
235
+ def __init__(self, config):
236
+ super().__init__()
237
+ self.use_parallel_residual = config.use_parallel_residual
238
+ self.input_layernorm = nn.LayerNorm(
239
+ config.hidden_size,
240
+ eps=config.layer_norm_eps,
241
+ elementwise_affine=False,
242
+ )
243
+ self.post_attention_layernorm = nn.LayerNorm(
244
+ config.hidden_size,
245
+ eps=config.layer_norm_eps
246
+ )
247
+ self.attention = Attention(config)
248
+ self.mlp = MLP(config)
249
+
250
+ def forward(
251
+ self,
252
+ hidden_states: Optional[torch.FloatTensor],
253
+ attention_mask: Optional[torch.FloatTensor] = None,
254
+ position_ids: Optional[torch.LongTensor] = None,
255
+ head_mask: Optional[torch.FloatTensor] = None,
256
+ use_cache: Optional[bool] = False,
257
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
258
+ output_attentions: Optional[bool] = False,
259
+ ):
260
+ attention_layer_outputs = self.attention(
261
+ self.input_layernorm(hidden_states),
262
+ attention_mask=attention_mask,
263
+ position_ids=position_ids,
264
+ layer_past=layer_past,
265
+ head_mask=head_mask,
266
+ use_cache=use_cache,
267
+ output_attentions=output_attentions,
268
+ )
269
+ attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights)
270
+ outputs = attention_layer_outputs[1:]
271
+
272
+ mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
273
+ hidden_states = hidden_states + mlp_output + attn_output
274
+
275
+ if use_cache:
276
+ outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights)
277
+ else:
278
+ outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights)
279
+
280
+ return outputs
281
+
282
+
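Note that `use_parallel_residual` is read from the config, but the forward pass above always takes the parallel path: attention and MLP both consume the layer input rather than being chained. Stripped of caching and masking, the update is one line (a sketch with stand-in callables, not code from this file):

```python
def parallel_residual(x, ln1, ln2, attn, mlp):
    # out = x + MLP(LN2(x)) + Attn(LN1(x)); both branches read the same x,
    # unlike the sequential form where the MLP sees the attention output
    return x + mlp(ln2(x)) + attn(ln1(x))
```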
283
+ class MLP(nn.Module):
284
+ def __init__(self, config: JapaneseStableLMAlphaConfig):
285
+ super().__init__()
286
+ hidden_size = config.hidden_size
287
+ multiple_of = 256
288
+ ff_dim = int(8 * hidden_size / 3)
289
+ intermediate_size = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
290
+
291
+ self.packed_input_proj = torch.nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
292
+ self.out_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
293
+ self.act = nn.SiLU()
294
+
295
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
296
+ ff, ff_gate = self.packed_input_proj(x).chunk(2, dim=-1)
297
+ return self.out_proj(ff * self.act(ff_gate))
298
+
299
+
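`MLP` is a SwiGLU block, and its `intermediate_size` is roughly 8/3 × the hidden size rounded up to a multiple of 256. Worked through for `hidden_size = 4096` (an illustrative value; the real one comes from the config):

```python
hidden_size = 4096  # illustrative; taken from the config in practice
multiple_of = 256
ff_dim = int(8 * hidden_size / 3)  # 10922
intermediate_size = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
assert intermediate_size == 11008  # 43 * 256, the next multiple of 256
```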
300
+ class RotaryEmbedding(torch.nn.Module):
301
+ """Based on Tri Dao's XPos: https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/layers/rotary.py"""
302
+ def __init__(
303
+ self,
304
+ dim: int,
305
+ max_position_embeddings: int,
306
+ base: int = 10_000,
307
+ scale_base: int = 512,
308
+ device: Optional[str] = None
309
+ ):
310
+ super().__init__()
311
+ self.dim = dim
312
+ self.seq_len_cached = max_position_embeddings
313
+
314
+ # Set up `inv_freq` term
315
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
316
+ self.register_buffer("inv_freq", inv_freq)
317
+
318
+ # Set up `scale` term
319
+ self.scale_base = scale_base
320
+ scale = (
321
+ (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
322
+ if scale_base is not None else None
323
+ )
324
+ self.register_buffer("scale", scale)
325
+
326
+ # Set up the `cos` and `sin` cache terms
327
+ t = torch.arange(self.seq_len_cached, device=device, dtype=torch.float32)
328
+ freqs = torch.outer(t, self.inv_freq)
329
+ # no torch.cat((freqs, freqs)) here: apply_rotary_pos_emb widens the half-dim caches itself
330
+ seq_range = torch.arange(self.seq_len_cached, dtype=self.scale.dtype, device=self.scale.device)
331
+ power = (seq_range - self.seq_len_cached // 2) / self.scale_base
332
+ scale_cached = self.scale.to(device=power.device) ** power.unsqueeze(-1)
333
+ # scale_cached likewise stays at half the rotary dim
334
+ self.register_buffer("cos_cached", torch.cos(freqs) * scale_cached, persistent=False)
335
+ self.register_buffer("sin_cached", torch.sin(freqs) * scale_cached, persistent=False)
336
+ self.register_buffer("cos_k_cached", torch.cos(freqs) / scale_cached, persistent=False)
337
+ self.register_buffer("sin_k_cached", torch.sin(freqs) / scale_cached, persistent=False)
338
+
339
+ def forward(self, x, seq_len=None):
340
+ if seq_len is not None and seq_len > self.seq_len_cached:
341
+ self.seq_len_cached = seq_len
342
+ t = torch.arange(seq_len, device=x.device, dtype=torch.float32)
343
+ freqs = torch.outer(t, self.inv_freq)
344
+ # no cat here: keep freqs at half the rotary dim, matching the caches built in __init__
345
+ seq_range = torch.arange(self.seq_len_cached, dtype=self.scale.dtype, device=self.scale.device)
346
+ power = (seq_range - self.seq_len_cached // 2) / self.scale_base
347
+ scale_cached = self.scale.to(device=power.device) ** power.unsqueeze(-1)
348
+ # likewise no cat for scale_cached, so the recomputed caches match the __init__ shapes
349
+ self.register_buffer("cos_cached", torch.cos(freqs) * scale_cached, persistent=False)
350
+ self.register_buffer("sin_cached", torch.sin(freqs) * scale_cached, persistent=False)
351
+ self.register_buffer("cos_k_cached", torch.cos(freqs) / scale_cached, persistent=False)
352
+ self.register_buffer("sin_k_cached", torch.sin(freqs) / scale_cached, persistent=False)
353
+ return (
354
+ self.cos_cached[:seq_len, ...],
355
+ self.sin_cached[:seq_len, ...],
356
+ self.cos_k_cached[:seq_len, ...],
357
+ self.sin_k_cached[:seq_len, ...],
358
+ )
359
+
360
+
361
+ def rotate_half(x):
362
+ x1, x2 = x.chunk(2, dim=-1)
363
+ return torch.cat((-x2, x1), dim=-1)
364
+
365
+
366
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, cos_k=None, sin_k=None):
367
+ """
368
+ q, k: [bs, num_heads, seq_len, rot_dim]
369
+ cos, sin: [seq_len, rot_dim / 2]
370
+ position_ids: [bs, seq_len]
371
+ """
372
+ # print(f"q: {q.shape}, k: {k.shape}, cos: {cos.shape}, sin: {sin.shape}, position_ids: {position_ids.shape}")
373
+ import einops
374
+ cos = einops.repeat(cos, 's r -> s (2 r)')
375
+ sin = einops.repeat(sin, 's r -> s (2 r)')
376
+ cos_k = einops.repeat(cos_k, 's r -> s (2 r)')
377
+ sin_k = einops.repeat(sin_k, 's r -> s (2 r)')
378
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim]
379
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim]
380
+ cos_k = cos_k[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim]
381
+ sin_k = sin_k[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim]
382
+
383
+ q_embed = (q * cos) + (rotate_half(q) * sin)
384
+ k_embed = (k * cos_k) + (rotate_half(k) * sin_k)
385
+ return q_embed, k_embed
386
+
387
+
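Two details above are worth flagging: `rotate_half` swaps the two halves of the rotary features (negating one), and the XPos asymmetry applies `scale ** power` to the query angles but `scale ** -power` to the key angles (the `_k` caches), so the query–key dot product decays with relative distance. A quick self-contained check of the rotation:

```python
import torch

x = torch.tensor([1.0, 2.0, 3.0, 4.0])  # halves: x1 = [1, 2], x2 = [3, 4]
x1, x2 = x.chunk(2, dim=-1)
rotated = torch.cat((-x2, x1), dim=-1)   # what rotate_half computes
assert rotated.tolist() == [-3.0, -4.0, 1.0, 2.0]
```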
388
+ class Attention(nn.Module):
389
+ def __init__(self, config):
390
+ super().__init__()
391
+ self.num_attention_heads = config.num_attention_heads
392
+ self.hidden_size = config.hidden_size
393
+ if self.hidden_size % self.num_attention_heads != 0:
394
+ raise ValueError(
395
+ "The hidden size is not divisble by the number of attention heads! Make sure to update them"
396
+ )
397
+ self.head_size = self.hidden_size // self.num_attention_heads
398
+
399
+ max_positions = config.max_position_embeddings
400
+ self.register_buffer(
401
+ "bias",
402
+ torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
403
+ 1, 1, max_positions, max_positions
404
+ ),
405
+ persistent=False,
406
+ )
407
+ self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
408
+
409
+ self.rotary_ndims = int(self.head_size * config.rotary_pct)
410
+ self.rotary_emb = RotaryEmbedding(
411
+ self.rotary_ndims,
412
+ max_position_embeddings=config.max_position_embeddings,
413
+ base=config.rotary_emb_base,
414
+ scale_base=config.rotary_scale_base,
415
+ )
416
+
417
+ self.register_buffer(
418
+ "norm_factor",
419
+ torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()),
420
+ persistent=False,
421
+ )
422
+
423
+ self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
424
+ self.dense = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
425
+
426
+ def forward(
427
+ self,
428
+ hidden_states: torch.FloatTensor,
429
+ attention_mask: torch.FloatTensor,
430
+ position_ids: torch.LongTensor,
431
+ head_mask: Optional[torch.FloatTensor] = None,
432
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
433
+ use_cache: Optional[bool] = False,
434
+ output_attentions: Optional[bool] = False,
435
+ ):
436
+ has_layer_past = layer_past is not None
437
+
438
+ # Compute QKV
439
+ # Attention heads [batch, seq_len, hidden_size]
440
+ # --> [batch, seq_len, (num_heads * 3 * head_size)]
441
+ qkv = self.query_key_value(hidden_states)
442
+
443
+ # [batch, seq_len, (num_heads * 3 * head_size)]
444
+ # --> [batch, seq_len, num_heads, 3 * head_size]
445
+ new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
446
+ qkv = qkv.view(*new_qkv_shape)
447
+
448
+ # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size]
449
+ query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
450
+ key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
451
+ value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
452
+
453
+ # Compute rotary embeddings on rotary_ndims
454
+ query_rot = query[..., : self.rotary_ndims]
455
+ query_pass = query[..., self.rotary_ndims :]
456
+ key_rot = key[..., : self.rotary_ndims]
457
+ key_pass = key[..., self.rotary_ndims :]
458
+
459
+ # Compute token offset for rotary embeddings (when decoding)
460
+ kv_seq_len = key.shape[-2]
461
+ if has_layer_past:
462
+ kv_seq_len += layer_past[0].shape[-2]
463
+
464
+ # Add rotary embeddings to query and key
465
+ # TODO: Check if using xpos
466
+ cos, sin, cos_k, sin_k = self.rotary_emb(value, seq_len=kv_seq_len)
467
+ query, key = apply_rotary_pos_emb(
468
+ query_rot, key_rot, cos, sin, position_ids, cos_k=cos_k, sin_k=sin_k)
469
+
470
+ query = torch.cat((query, query_pass), dim=-1)
471
+ key = torch.cat((key, key_pass), dim=-1)
472
+
473
+ # Cache QKV values
474
+ if has_layer_past:
475
+ past_key = layer_past[0]
476
+ past_value = layer_past[1]
477
+ key = torch.cat((past_key, key), dim=-2)
478
+ value = torch.cat((past_value, value), dim=-2)
479
+ present = (key, value) if use_cache else None
480
+
481
+ # Compute attention
482
+ attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
483
+
484
+ # Merge attn_head_size dim and num_attn_heads dim into hidden dim
485
+ # [bs, seq_len, num_attention_heads, attn_head_size]
486
+ attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
487
+ attn_output = attn_output.view(attn_output.size(0), attn_output.size(1), self.num_attention_heads * self.head_size)
488
+
489
+ attn_output = self.dense(attn_output)
490
+
491
+ outputs = (attn_output, present)
492
+ if output_attentions:
493
+ outputs += (attn_weights,)
494
+
495
+ return outputs
496
+
497
+ def _attn(self, query, key, value, attention_mask=None, head_mask=None):
498
+ # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size]
499
+ # compute causal mask from causal mask buffer
500
+
501
+ batch_size, num_attention_heads, query_length, attn_head_size = query.size()
502
+ key_length = key.size(-2)
503
+
504
+ causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
505
+
506
+ query = query.view(batch_size * num_attention_heads, query_length, attn_head_size)
507
+ key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
508
+ attn_scores = torch.zeros(
509
+ batch_size * num_attention_heads,
510
+ query_length,
511
+ key_length,
512
+ dtype=query.dtype,
513
+ device=key.device,
514
+ )
515
+ attn_scores = torch.baddbmm(
516
+ attn_scores,
517
+ query,
518
+ key.transpose(1, 2),
519
+ beta=1.0,
520
+ alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor),
521
+ )
522
+ attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length)
523
+
524
+ mask_value = torch.finfo(attn_scores.dtype).min
525
+ # Needs to be a tensor; otherwise we get: `RuntimeError: expected scalar type float but found double`.
526
+ # Needs to be on the same device; otherwise: `RuntimeError: ..., x and y to be on the same device`.
527
+ mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype, device=attn_scores.device)
528
+ attn_scores = torch.where(causal_mask, attn_scores, mask_value)
529
+
530
+ if attention_mask is not None:
531
+ # Apply the attention mask
532
+ attn_scores = attn_scores + attention_mask
533
+
534
+ # NOTE: Upcast to float32
535
+ attn_weights = nn.functional.softmax(attn_scores, dim=-1, dtype=torch.float32).type_as(value)
536
+
537
+ # Mask heads if we want to
538
+ if head_mask is not None:
539
+ attn_weights = attn_weights * head_mask
540
+
541
+ attn_output = torch.matmul(attn_weights, value)
542
+ return attn_output, attn_weights
543
+
544
+
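The slice `self.bias[:, :, key_length - query_length : key_length, :key_length]` picks out only the mask rows belonging to the new query positions, which is what makes incremental decoding with a cache work. A toy check with one cached token and two fresh queries (sizes are hypothetical):

```python
import torch

max_positions = 4
bias = torch.tril(torch.ones(max_positions, max_positions, dtype=torch.bool))

key_length, query_length = 3, 2  # 1 cached token + 2 new query positions
causal_mask = bias[key_length - query_length : key_length, :key_length]
# each new query may attend to the cache and to itself, but not ahead
assert causal_mask.tolist() == [[True, True, False], [True, True, True]]
```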
545
+ def attention_mask_func(attention_scores, ltor_mask):
546
+ attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min)
547
+ return attention_scores
548
+
549
+
550
+ class JapaneseStableLMAlphaForCausalLM(JapaneseStableLMAlphaPreTrainedModel):
551
+ _tied_weights_keys = ["embed_out.weight"]
552
+
553
+ def __init__(self, config):
554
+ super().__init__(config)
555
+
556
+ self.transformer = JapaneseStableLMAlphaModel(config)
557
+ self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
558
+
559
+ # Initialize weights and apply final processing
560
+ self.post_init()
561
+
562
+ def get_output_embeddings(self):
563
+ return self.embed_out
564
+
565
+ def set_output_embeddings(self, new_embeddings):
566
+ self.embed_out = new_embeddings
567
+
568
+ def forward(
569
+ self,
570
+ input_ids: Optional[torch.LongTensor] = None,
571
+ attention_mask: Optional[torch.FloatTensor] = None,
572
+ position_ids: Optional[torch.LongTensor] = None,
573
+ inputs_embeds: Optional[torch.FloatTensor] = None,
574
+ head_mask: Optional[torch.FloatTensor] = None,
575
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
576
+ labels: Optional[torch.LongTensor] = None,
577
+ use_cache: Optional[bool] = None,
578
+ output_attentions: Optional[bool] = None,
579
+ output_hidden_states: Optional[bool] = None,
580
+ return_dict: Optional[bool] = None,
581
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
582
+ r"""
583
+ Example:
584
+
585
+ ```python
586
+ >>> import torch
587
+ >>> from transformers import LlamaTokenizer, JapaneseStableLMAlphaForCausalLM, JapaneseStableLMAlphaConfig
588
+
589
+ >>> tokenizer = LlamaTokenizer.from_pretrained("novelai/nerdstash-tokenizer-v1")
590
+ >>> config = JapaneseStableLMAlphaConfig.from_pretrained("stabilityai/stablelm-ja-base-alpha-7b")
591
+ >>> config.is_decoder = True
592
+ >>> model = JapaneseStableLMAlphaForCausalLM.from_pretrained("stabilityai/stablelm-ja-base-alpha-7b", config=config, trust_remote_code=True)
593
+
594
+ >>> inputs = tokenizer("日本語の美しいところは、", return_tensors="pt")
595
+ >>> outputs = model(**inputs)
596
+
597
+ >>> prediction_logits = outputs.logits
598
+ ```"""
599
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
600
+
601
+ outputs = self.transformer(
602
+ input_ids,
603
+ attention_mask=attention_mask,
604
+ position_ids=position_ids,
605
+ head_mask=head_mask,
606
+ inputs_embeds=inputs_embeds,
607
+ past_key_values=past_key_values,
608
+ use_cache=use_cache,
609
+ output_attentions=output_attentions,
610
+ output_hidden_states=output_hidden_states,
611
+ return_dict=return_dict,
612
+ )
613
+
614
+ hidden_states = outputs[0]
615
+ lm_logits = self.embed_out(hidden_states)
616
+
617
+ lm_loss = None
618
+ if labels is not None:
619
+ # move labels to correct device to enable model parallelism
620
+ labels = labels.to(lm_logits.device)
621
+ # we are doing next-token prediction; shift prediction scores and input ids by one
622
+ shift_logits = lm_logits[:, :-1, :].contiguous()
623
+ labels = labels[:, 1:].contiguous()
624
+ loss_fct = CrossEntropyLoss()
625
+ lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1))
626
+
627
+ if not return_dict:
628
+ output = (lm_logits,) + outputs[1:]
629
+ return ((lm_loss,) + output) if lm_loss is not None else output
630
+
631
+ return CausalLMOutputWithPast(
632
+ loss=lm_loss,
633
+ logits=lm_logits,
634
+ past_key_values=outputs.past_key_values,
635
+ hidden_states=outputs.hidden_states,
636
+ attentions=outputs.attentions,
637
+ )
638
+
639
+ def prepare_inputs_for_generation(
640
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
641
+ ):
642
+ input_shape = input_ids.shape
643
+
644
+ # cut decoder_input_ids if past is used
645
+ if past_key_values and past_key_values[0] is not None:
646
+ input_ids = input_ids[:, -1:]
647
+
648
+ position_ids = kwargs.get("position_ids", None)
649
+ if attention_mask is not None and position_ids is None:
650
+ # create position_ids on the fly for batch generation
651
+ position_ids = attention_mask.long().cumsum(-1) - 1
652
+ position_ids.masked_fill_(attention_mask == 0, 1)
653
+ if past_key_values:
654
+ position_ids = position_ids[:, -1].unsqueeze(-1)
655
+
656
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
657
+ if attention_mask is None:
658
+ attention_mask = input_ids.new_ones(input_shape)
659
+
660
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
661
+ if inputs_embeds is not None and past_key_values is None:
662
+ model_inputs = {"inputs_embeds": inputs_embeds}
663
+ else:
664
+ model_inputs = {"input_ids": input_ids}
665
+
666
+ model_inputs.update(
667
+ {
668
+ "attention_mask": attention_mask,
669
+ "past_key_values": past_key_values,
670
+ "position_ids": position_ids,
671
+ }
672
+ )
673
+
674
+ return model_inputs
675
+
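The cumulative-sum trick above gives left-padded rows correct positions while parking padded slots at a dummy index; a quick check with toy values:

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])  # one left-padded row
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
assert position_ids.tolist() == [[1, 1, 0, 1, 2]]  # real tokens get 0, 1, 2
```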
676
+ def _reorder_cache(self, past_key_values, beam_idx):
677
+ reordered_past = ()
678
+ for layer_past in past_key_values:
679
+ reordered_past += (
680
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
681
+ )
682
+ return reordered_past
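Putting the whole file to work goes through `generate`; a usage sketch assuming the repo id from the docstring example, with the tokenizer arguments following the published model card (treat the `additional_special_tokens` value as an assumption):

```python
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained(
    "novelai/nerdstash-tokenizer-v1", additional_special_tokens=["▁▁"]
)
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-ja-base-alpha-7b",
    trust_remote_code=True,   # loads this modeling file from the repo
    torch_dtype=torch.float16,
)

inputs = tokenizer("日本語の美しいところは、", return_tensors="pt")
with torch.no_grad():
    tokens = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.8)
print(tokenizer.decode(tokens[0], skip_special_tokens=True))
```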
preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "BlipImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "processor_class": "InstructBlipProcessor",
18
+ "resample": 3,
19
+ "rescale_factor": 0.00392156862745098,
20
+ "size": {
21
+ "height": 224,
22
+ "width": 224
23
+ }
24
+ }
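For reference, `image_mean`/`image_std` above are the standard CLIP normalization constants and `rescale_factor` is 1/255, so this config reproduces CLIP-style preprocessing at 224×224. A minimal sketch of loading it (the repo id is a placeholder for wherever this file ships):

```python
from transformers import BlipImageProcessor

# "<this-repo>" is a placeholder; from_pretrained reads preprocessor_config.json
image_processor = BlipImageProcessor.from_pretrained("<this-repo>")
# pipeline: convert to RGB -> resize to 224x224 (bicubic, resample=3)
#           -> rescale by 1/255 -> normalize with the CLIP mean/std above
```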
pytorch_model-00001-of-00004.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be7874a30f77cac685b8b1c446e7ac97ea2a066c6e9dc4a75c43d7c45341eb9f
3
+ size 9928428361
pytorch_model-00002-of-00004.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbca4cbebac59e57652738c1fbbccd18abbc77faff5971774a74050f40593ae0
3
+ size 9982874199
pytorch_model-00003-of-00004.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9152e6b29b5067795d63786f91a185282101c6fdcad843c40b208cc968d137d
3
+ size 9714437907
pytorch_model-00004-of-00004.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5a02c272a7eaf9b8fb1248a52522f89533a432703e4764400fbdfac4d2fde8a
3
+ size 3233898185
pytorch_model.bin.index.fp16.json ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin.index.json ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.fp16-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb6de9b5fc91eec3b6a57d6dacd14073001f37fa860e68775beaf3afb7d79dc
3
+ size 9955835753
pytorch_model.fp16-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:528041e8ff85ea59f4ba0508efc852ef55c9fe15cedd2956c6f79ee8630f81f6
3
+ size 6474190985
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ sentencepiece
2
+ einops