vs4vijay and pseudotensor committed on
Commit b6bff08 · 0 Parent(s)

Duplicate from h2oai/h2ogpt-chatbot

Co-authored-by: Jonathan McKinney <pseudotensor@users.noreply.huggingface.co>

Files changed (11)
  1. .gitattributes +34 -0
  2. LICENSE +201 -0
  3. README.md +14 -0
  4. app.py +1959 -0
  5. client_test.py +93 -0
  6. finetune.py +934 -0
  7. h2o-logo.svg +1 -0
  8. prompter.py +106 -0
  9. requirements.txt +48 -0
  10. stopping.py +139 -0
  11. utils.py +154 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: H2ogpt Chatbot
+ emoji: 📚
+ colorFrom: yellow
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.27.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ duplicated_from: h2oai/h2ogpt-chatbot
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,1959 @@
1
+ import functools
2
+ import inspect
3
+ import sys
4
+ import os
5
+ import traceback
6
+ import typing
7
+ from utils import set_seed, flatten_list, clear_torch_cache, system_info_print, zip_data, save_generate_output
8
+
9
+ SEED = 1236
10
+ set_seed(SEED)
11
+
12
+ os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
13
+ from typing import Union
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ import fire
18
+ import torch
19
+ from peft import PeftModel
20
+ from transformers import GenerationConfig, StoppingCriteriaList, AutoModel
21
+ from accelerate import init_empty_weights, infer_auto_device_map
22
+
23
+ from prompter import Prompter
24
+
25
+ from finetune import get_loaders, example_data_points, generate_prompt, get_githash, prompt_types_strings, \
26
+ human, bot, prompt_type_to_model_name, inv_prompt_type_to_model_lower
27
+ from stopping import CallbackToGenerator, Stream, StoppingCriteriaSub
28
+
29
+ is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
30
+ is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
31
+ is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
32
+ is_low_mem = is_hf # assumes run on 24GB consumer GPU
33
+ admin_pass = os.getenv("ADMIN_PASS")
34
+ # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
35
+ raise_generate_gpu_exceptions = True
36
+
37
+ eval_extra_columns = ['prompt', 'response', 'score']
38
+
39
+ def main(
40
+ load_8bit: bool = False,
41
+ load_half: bool = True,
42
+ infer_devices: bool = True,
43
+ base_model: str = '',
44
+ tokenizer_base_model: str = '',
45
+ lora_weights: str = "",
46
+ gpu_id: int = 0, # if infer_devices = True and gpu_id != -1
47
+
48
+ prompt_type: Union[int, str] = None,
49
+ # input to generation
50
+ temperature: float = None,
51
+ top_p: float = None,
52
+ top_k: int = None,
53
+ num_beams: int = None,
54
+ repetition_penalty: float = None,
55
+ num_return_sequences: int = None,
56
+ do_sample: bool = None,
57
+ max_new_tokens: int = None,
58
+ min_new_tokens: int = None,
59
+ early_stopping: Union[bool, str] = None,
60
+ max_time: float = None,
61
+
62
+ llama_type: bool = None,
63
+ debug: bool = False,
64
+ save_dir: str = None,
65
+ share: bool = True,
66
+ local_files_only: bool = False,
67
+ resume_download: bool = True,
68
+ use_auth_token: Union[str, bool] = False, # True requires CLI did huggingface-cli login before running
69
+
70
+ src_lang: str = "English",
71
+ tgt_lang: str = "Russian",
72
+
73
+ gradio: bool = True,
74
+ gradio_avoid_processing_markdown: bool = False,
75
+ chat: bool = True,
76
+ chat_history: int = 4096, # character length of chat context/history
77
+ stream_output: bool = True,
78
+ show_examples: bool = None,
79
+ verbose: bool = False,
80
+ h2ocolors: bool = True,
81
+ height: int = 400,
82
+ show_lora: bool = True,
83
+ # set to True to load --base_model after client logs in,
84
+ # to be able to free GPU memory when model is swapped
85
+ login_mode_if_model0: bool = False,
86
+
87
+ sanitize_user_prompt: bool = True,
88
+ sanitize_bot_response: bool = True,
89
+
90
+ extra_model_options: typing.List[str] = [],
91
+ extra_lora_options: typing.List[str] = [],
92
+
93
+ score_model: str = 'OpenAssistant/reward-model-deberta-v3-large-v2',
94
+ auto_score: bool = True,
95
+
96
+ eval_sharegpt_prompts_only: int = 0,
97
+ eval_sharegpt_prompts_only_seed: int = 1234,
98
+ eval_sharegpt_as_output: bool = False,
99
+ ):
100
+ # allow set token directly
101
+ use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
102
+
103
+ if is_public:
104
+ temperature = 0.4
105
+ top_p = 0.85
106
+ top_k = 70
107
+ do_sample = True
108
+ if is_low_mem:
109
+ base_model = 'h2oai/h2ogpt-oasst1-512-12b'
110
+ load_8bit = True
111
+ else:
112
+ base_model = 'h2oai/h2ogpt-oasst1-512-20b'
113
+ if is_low_mem:
114
+ load_8bit = True
115
+ if is_hf:
116
+ # must override share if in spaces
117
+ share = False
118
+ save_dir = os.getenv('SAVE_DIR', save_dir)
119
+
120
+ # get defaults
121
+ model_lower = base_model.lower()
122
+ if not gradio:
123
+ # force streaming off, else output is not the single response we want to inspect
124
+ stream_output = False
125
+ # else prompt removal can mess up output
126
+ chat = False
127
+
128
+ placeholder_instruction, placeholder_input, \
129
+ stream_output, show_examples, \
130
+ prompt_type, temperature, top_p, top_k, num_beams, \
131
+ max_new_tokens, min_new_tokens, early_stopping, max_time, \
132
+ repetition_penalty, num_return_sequences, \
133
+ do_sample, \
134
+ src_lang, tgt_lang, \
135
+ examples, \
136
+ task_info = \
137
+ get_generate_params(model_lower, chat,
138
+ stream_output, show_examples,
139
+ prompt_type, temperature, top_p, top_k, num_beams,
140
+ max_new_tokens, min_new_tokens, early_stopping, max_time,
141
+ repetition_penalty, num_return_sequences,
142
+ do_sample,
143
+ )
144
+
145
+ if not gradio:
146
+ if eval_sharegpt_prompts_only > 0:
147
+ # override default examples with shareGPT ones for human-level eval purposes only
148
+ eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
149
+ if not os.path.isfile(eval_filename):
150
+ os.system(
151
+ 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
152
+ import json
153
+ data = json.load(open(eval_filename, 'rt'))
154
+ # focus on data that starts with human, else likely chopped from other data
155
+ turn_start = 0 # odd in general
156
+ data = [x for x in data if len(x['conversations']) > turn_start + 1 and
157
+ x['conversations'][turn_start]['from'] == 'human' and
158
+ x['conversations'][turn_start + 1]['from'] == 'gpt']
159
+ np.random.seed(eval_sharegpt_prompts_only_seed)
160
+ example1 = examples[-1] # pick reference example
161
+ examples = []
162
+ responses = []
163
+ for i in list(np.random.randint(0, len(data), size=eval_sharegpt_prompts_only)):
164
+ assert data[i]['conversations'][turn_start]['from'] == 'human'
165
+ instruction = data[i]['conversations'][turn_start]['value']
166
+ assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
167
+ output = data[i]['conversations'][turn_start + 1]['value']
168
+ examplenew = example1.copy()
169
+ assert not chat, "No gradio: must use chat=False, uses nochat instruct"
170
+ examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
171
+ examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
172
+ examplenew[eval_func_param_names.index('context')] = '' # no context
173
+ examples.append(examplenew)
174
+ responses.append(output)
175
+
176
+ num_examples = len(examples)
177
+ scoring_path = 'scoring'
178
+ os.makedirs(scoring_path, exist_ok=True)
179
+ if eval_sharegpt_as_output:
180
+ used_base_model = 'gpt35'
181
+ used_lora_weights = ''
182
+ else:
183
+ used_base_model = str(base_model.split('/')[-1])
184
+ used_lora_weights = str(lora_weights.split('/')[-1])
185
+ eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
186
+ eval_sharegpt_prompts_only_seed,
187
+ eval_sharegpt_as_output,
188
+ used_base_model,
189
+ used_lora_weights)
190
+ eval_filename = os.path.join(scoring_path, eval_filename)
191
+
192
+ with torch.device("cuda"):
193
+ # ensure was set right above before examples generated
194
+ assert not stream_output, "stream_output=True does not make sense with example loop"
195
+ import time
196
+ from functools import partial
197
+
198
+ # get score model
199
+ smodel, stokenizer, sdevice = get_score_model(**locals())
200
+
201
+ if not eval_sharegpt_as_output:
202
+ model, tokenizer, device = get_model(**locals())
203
+ model_state = [model, tokenizer, device, base_model]
204
+ fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir)
205
+ else:
206
+ assert eval_sharegpt_prompts_only > 0
207
+
208
+ def get_response(*args, exi=0):
209
+ # assumes same ordering of examples and responses
210
+ yield responses[exi]
211
+
212
+ fun = get_response
213
+ t0 = time.time()
214
+ score_dump = []
215
+
216
+ import matplotlib.pyplot as plt
217
+
218
+ for exi, ex in enumerate(examples):
219
+ instruction = ex[eval_func_param_names.index('instruction_nochat')]
220
+ iinput = ex[eval_func_param_names.index('iinput_nochat')]
221
+ context = ex[eval_func_param_names.index('context')]
222
+ clear_torch_cache()
223
+ print("")
224
+ print("START" + "=" * 100)
225
+ print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
226
+ print("-" * 105)
227
+ # fun yields as generator, so have to iterate over it
228
+ # Also means likely do NOT want --stream_output=True, else would show all generations
229
+ for res in fun(*tuple(ex), exi=exi):
230
+ print(res)
231
+ if smodel:
232
+ score_with_prompt = False
233
+ if score_with_prompt:
234
+ data_point = dict(instruction=instruction, input=iinput, context=context)
235
+ prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
236
+ prompt = prompter.generate_prompt(data_point)
237
+ else:
238
+ # just raw input and output
239
+ assert iinput in [None, ''] # should be no iinput
240
+ assert context in [None, ''] # should be no context
241
+ prompt = instruction
242
+ cutoff_len = 768 if is_low_mem else 2048
243
+ inputs = stokenizer(prompt, res,
244
+ return_tensors="pt",
245
+ truncation=True,
246
+ max_length=cutoff_len)
247
+ try:
248
+ score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
249
+ except torch.cuda.OutOfMemoryError as e:
250
+ print("GPU OOM: question: %s answer: %s exception: %s" % (prompt, res, str(e)), flush=True)
251
+ traceback.print_exc()
252
+ score = 0.0
253
+ clear_torch_cache()
254
+ except (Exception, RuntimeError) as e:
255
+ if 'Expected all tensors to be on the same device' in str(e) or \
256
+ 'expected scalar type Half but found Float' in str(e) or \
257
+ 'probability tensor contains either' in str(e) or \
258
+ 'cublasLt ran into an error!' in str(e):
259
+ print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
260
+ flush=True)
261
+ traceback.print_exc()
262
+ score = 0.0
263
+ clear_torch_cache()
264
+ else:
265
+ raise
266
+ print("SCORE %s: %s" % (exi, score), flush=True)
267
+ score_dump.append(ex + [prompt, res, score])
268
+ # dump every score in case abort
269
+ df_scores = pd.DataFrame(score_dump,
270
+ columns=eval_func_param_names + eval_extra_columns)
271
+ df_scores.to_parquet(eval_filename, index=False)
272
+ # plot histogram so far
273
+ plt.figure(figsize=(10, 10))
274
+ plt.hist(df_scores['score'], bins=20)
275
+ score_avg = np.mean(df_scores['score'])
276
+ score_median = np.median(df_scores['score'])
277
+ plt.title("Score avg: %s median: %s" % (score_avg, score_median))
278
+ plt.savefig(eval_filename.replace('.parquet', '.png'))
279
+ plt.close()
280
+
281
+ print("END" + "=" * 102)
282
+ print("")
283
+ t2 = time.time()
284
+ print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
285
+ t1 = time.time()
286
+ print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
287
+ return eval_filename
288
+
289
+ if gradio:
290
+ go_gradio(**locals())
291
+
292
+
293
+ def get_device():
294
+ if torch.cuda.is_available():
295
+ device = "cuda"
296
+ else:
297
+ raise RuntimeError("only cuda supported")
298
+
299
+ return device
300
+
301
+
302
+ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
303
+ gpu_id=0,
304
+ use_auth_token=False):
305
+ """
306
+ Ensure model gets on correct device
307
+ :param base_model:
308
+ :param model_loader:
309
+ :param load_half:
310
+ :param model_kwargs:
311
+ :param reward_type:
312
+ :param gpu_id:
313
+ :param use_auth_token:
314
+ :return:
315
+ """
316
+ with init_empty_weights():
317
+ from transformers import AutoConfig
318
+ config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token)
319
+ model = AutoModel.from_config(
320
+ config,
321
+ )
322
+
323
+ # NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
324
+ # NOTE: Some models require avoiding sharding some layers,
325
+ # then would pass no_split_module_classes and give list of those layers.
326
+ device_map = infer_auto_device_map(
327
+ model,
328
+ dtype=torch.float16 if load_half else torch.float32,
329
+ )
330
+ if hasattr(model, 'model'):
331
+ device_map_model = infer_auto_device_map(
332
+ model.model,
333
+ dtype=torch.float16 if load_half else torch.float32,
334
+ )
335
+ device_map.update(device_map_model)
336
+ print('device_map: %s' % device_map, flush=True)
337
+
338
+ if gpu_id >= 0:
339
+ # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
340
+ # So avoid for now, just put on first GPU, unless score_model, put on last
341
+ n_gpus = torch.cuda.device_count()
342
+ if reward_type:
343
+ device_map = {'': n_gpus - 1}
344
+ else:
345
+ device_map = {'': min(n_gpus - 1, gpu_id)}
346
+
347
+ load_in_8bit = model_kwargs.get('load_in_8bit', False)
348
+ model_kwargs['device_map'] = device_map
349
+
350
+ if load_in_8bit or not load_half:
351
+ model = model_loader.from_pretrained(
352
+ base_model,
353
+ **model_kwargs,
354
+ )
355
+ else:
356
+ model = model_loader.from_pretrained(
357
+ base_model,
358
+ **model_kwargs,
359
+ ).half()
360
+ return model
361
+
362
+
363
+ def get_model(
364
+ load_8bit: bool = False,
365
+ load_half: bool = True,
366
+ infer_devices: bool = True,
367
+ base_model: str = '',
368
+ tokenizer_base_model: str = '',
369
+ lora_weights: str = "",
370
+ gpu_id: int = 0,
371
+
372
+ llama_type: bool = None,
373
+ reward_type: bool = None,
374
+ local_files_only: bool = False,
375
+ resume_download: bool = True,
376
+ use_auth_token: Union[str, bool] = False,
377
+ compile: bool = True,
378
+ **kwargs,
379
+ ):
380
+ """
381
+
382
+ :param load_8bit: load model in 8-bit, not supported by all models
383
+ :param load_half: load model in 16-bit
384
+ :param infer_devices: Use torch infer of optimal placement of layers on devices (for non-lora case)
385
+ For non-LORA case, False will spread shards across multiple GPUs, but this can lead to cuda:x cuda:y mismatches
386
+ So it is not the default
387
+ :param base_model: name/path of base model
388
+ :param tokenizer_base_model: name/path of tokenizer
389
+ :param lora_weights: name/path
390
+ :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1)
391
+ :param llama_type: whether LLaMa type model
392
+ :param reward_type: reward type model for sequence classification
393
+ :param local_files_only: use local files instead of from HF
394
+ :param resume_download: resume downloads from HF
395
+ :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
396
+ :param compile: whether to compile torch model
397
+ :param kwargs:
398
+ :return:
399
+ """
400
+ print("Get %s model" % base_model, flush=True)
401
+ if lora_weights is not None and lora_weights.strip():
402
+ print("Get %s lora weights" % lora_weights, flush=True)
403
+ device = get_device()
404
+
405
+ if 'gpt2' in base_model.lower():
406
+ # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
407
+ load_8bit = False
408
+
409
+ assert base_model.strip(), (
410
+ "Please choose a base model with --base_model (CLI) or in Models Tab (gradio)"
411
+ )
412
+ llama_type = llama_type or "llama" in base_model
413
+ model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=reward_type)
414
+ if not tokenizer_base_model:
415
+ tokenizer_base_model = base_model
416
+
417
+ if tokenizer_loader is not None and not isinstance(tokenizer_loader, str):
418
+ tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
419
+ local_files_only=local_files_only,
420
+ resume_download=resume_download,
421
+ use_auth_token=use_auth_token,
422
+ )
423
+ else:
424
+ tokenizer = tokenizer_loader
425
+
426
+ if isinstance(tokenizer, str):
427
+ # already a pipeline, tokenizer_loader is string for task
428
+ model = model_loader(tokenizer,
429
+ model=base_model,
430
+ device=0 if device == "cuda" else -1,
431
+ torch_dtype=torch.float16)
432
+ else:
433
+ assert device == "cuda", "Unsupported device %s" % device
434
+ model_kwargs = dict(local_files_only=local_files_only,
435
+ torch_dtype=torch.float16,
436
+ resume_download=resume_download,
437
+ use_auth_token=use_auth_token)
438
+ if 'mbart-' not in base_model.lower():
439
+ model_kwargs.update(dict(load_in_8bit=load_8bit,
440
+ device_map={"": 0} if load_8bit else "auto",
441
+ ))
442
+ if 'OpenAssistant/reward-model'.lower() in base_model.lower():
443
+ # could put on other GPUs
444
+ model_kwargs['device_map'] = {"": 0}
445
+ model_kwargs.pop('torch_dtype', None)
446
+
447
+ if not lora_weights:
448
+ with torch.device("cuda"):
449
+ if infer_devices:
450
+ model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
451
+ gpu_id=gpu_id, use_auth_token=use_auth_token)
452
+ else:
453
+ if load_half and not load_8bit:
454
+ model = model_loader.from_pretrained(
455
+ base_model,
456
+ **model_kwargs).half()
457
+ else:
458
+ model = model_loader.from_pretrained(
459
+ base_model,
460
+ **model_kwargs)
461
+ elif load_8bit:
462
+ model = model_loader.from_pretrained(
463
+ base_model,
464
+ **model_kwargs
465
+ )
466
+ model = PeftModel.from_pretrained(
467
+ model,
468
+ lora_weights,
469
+ torch_dtype=torch.float16,
470
+ local_files_only=local_files_only,
471
+ resume_download=resume_download,
472
+ use_auth_token=use_auth_token,
473
+ device_map={"": 0}, # seems to be required
474
+ )
475
+ else:
476
+ with torch.device("cuda"):
477
+ model = model_loader.from_pretrained(
478
+ base_model,
479
+ **model_kwargs
480
+ )
481
+ model = PeftModel.from_pretrained(
482
+ model,
483
+ lora_weights,
484
+ torch_dtype=torch.float16,
485
+ local_files_only=local_files_only,
486
+ resume_download=resume_download,
487
+ use_auth_token=use_auth_token,
488
+ device_map="auto",
489
+ )
490
+ if load_half:
491
+ model.half()
492
+
493
+ # unwind broken decapoda-research config
494
+ if llama_type:
495
+ model.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
496
+ model.config.bos_token_id = 1
497
+ model.config.eos_token_id = 2
498
+ if 'gpt2' in base_model.lower():
499
+ # add special tokens that otherwise all share the same id
500
+ tokenizer.add_special_tokens({'bos_token': '<bos>',
501
+ 'eos_token': '<eos>',
502
+ 'pad_token': '<pad>'})
503
+
504
+ if not isinstance(tokenizer, str):
505
+ model.eval()
506
+ if torch.__version__ >= "2" and sys.platform != "win32" and compile:
507
+ model = torch.compile(model)
508
+
509
+ return model, tokenizer, device
510
+
511
+
512
+ def get_score_model(**kwargs):
513
+ # score model
514
+ if kwargs.get('score_model') is not None and kwargs.get('score_model').strip():
515
+ score_all_kwargs = kwargs.copy()
516
+ score_all_kwargs['load_8bit'] = False
517
+ score_all_kwargs['load_half'] = False
518
+ score_all_kwargs['base_model'] = kwargs.get('score_model').strip()
519
+ score_all_kwargs['tokenizer_base_model'] = ''
520
+ score_all_kwargs['lora_weights'] = ''
521
+ score_all_kwargs['llama_type'] = False
522
+ score_all_kwargs['compile'] = False
523
+ smodel, stokenizer, sdevice = get_model(**score_all_kwargs)
524
+ else:
525
+ smodel, stokenizer, sdevice = None, None, None
526
+ return smodel, stokenizer, sdevice
527
+
528
+
529
+ def go_gradio(**kwargs):
530
+ # get default model
531
+ all_kwargs = kwargs.copy()
532
+ all_kwargs.update(locals())
533
+ if kwargs.get('base_model') and not kwargs['login_mode_if_model0']:
534
+ model0, tokenizer0, device = get_model(**all_kwargs)
535
+ else:
536
+ # if empty model, then don't load anything, just get gradio up
537
+ model0, tokenizer0, device = None, None, None
538
+ model_state0 = [model0, tokenizer0, device, kwargs['base_model']]
539
+
540
+ # get score model
541
+ smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
542
+
543
+ if 'mbart-' in kwargs['model_lower']:
544
+ instruction_label_nochat = "Text to translate"
545
+ else:
546
+ instruction_label_nochat = "Instruction"
547
+ instruction_label = "You (Shift-Enter or push Submit to send message)"
548
+
549
+ title = 'h2oGPT'
550
+ if kwargs['verbose']:
551
+ description = f"""Model {kwargs['base_model']} Instruct dataset.
552
+ For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).
553
+ Command: {str(' '.join(sys.argv))}
554
+ Hash: {get_githash()}
555
+ """
556
+ else:
557
+ description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
558
+ if is_public:
559
+ description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
560
+ if kwargs['load_8bit']:
561
+ description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
562
+ description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
563
+ description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
564
+
565
+ if kwargs['verbose']:
566
+ task_info_md = f"""
567
+ ### Task: {kwargs['task_info']}"""
568
+ else:
569
+ task_info_md = ''
570
+
571
+ css_code = """footer {visibility: hidden;}
572
+ body{background:linear-gradient(#f5f5f5,#e5e5e5);}
573
+ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
574
+
575
+ from gradio.themes.utils import Color, colors, fonts, sizes
576
+ if kwargs['h2ocolors']:
577
+ h2o_yellow = Color(
578
+ name="yellow",
579
+ c50="#fffef2",
580
+ c100="#fff9e6",
581
+ c200="#ffecb3",
582
+ c300="#ffe28c",
583
+ c400="#ffd659",
584
+ c500="#fec925",
585
+ c600="#e6ac00",
586
+ c700="#bf8f00",
587
+ c800="#a67c00",
588
+ c900="#664d00",
589
+ c950="#403000",
590
+ )
591
+ h2o_gray = Color(
592
+ name="gray",
593
+ c50="#f2f2f2",
594
+ c100="#e5e5e5",
595
+ c200="#cccccc",
596
+ c300="#b2b2b2",
597
+ c400="#999999",
598
+ c500="#7f7f7f",
599
+ c600="#666666",
600
+ c700="#4c4c4c",
601
+ c800="#333333",
602
+ c900="#191919",
603
+ c950="#0d0d0d",
604
+ )
605
+ colors_dict = dict(primary_hue=h2o_yellow,
606
+ secondary_hue=h2o_yellow,
607
+ neutral_hue=h2o_gray,
608
+ spacing_size=sizes.spacing_md,
609
+ radius_size=sizes.radius_md,
610
+ text_size=sizes.text_md,
611
+ )
612
+ else:
613
+ colors_dict = dict(primary_hue=colors.indigo,
614
+ secondary_hue=colors.indigo,
615
+ neutral_hue=colors.gray,
616
+ spacing_size=sizes.spacing_md,
617
+ radius_size=sizes.radius_md,
618
+ text_size=sizes.text_md,
619
+ )
620
+
621
+ import gradio as gr
622
+
623
+ if kwargs['gradio_avoid_processing_markdown']:
624
+ from gradio_client import utils as client_utils
625
+ from gradio.components import Chatbot
626
+
627
+ # gradio has issue with taking too long to process input/output for markdown etc.
628
+ # Avoid for now, allow raw html to render, good enough for chatbot.
629
+ def _postprocess_chat_messages(self, chat_message: str):
630
+ if chat_message is None:
631
+ return None
632
+ elif isinstance(chat_message, (tuple, list)):
633
+ filepath = chat_message[0]
634
+ mime_type = client_utils.get_mimetype(filepath)
635
+ filepath = self.make_temp_copy_if_needed(filepath)
636
+ return {
637
+ "name": filepath,
638
+ "mime_type": mime_type,
639
+ "alt_text": chat_message[1] if len(chat_message) > 1 else None,
640
+ "data": None, # These last two fields are filled in by the frontend
641
+ "is_file": True,
642
+ }
643
+ elif isinstance(chat_message, str):
644
+ return chat_message
645
+ else:
646
+ raise ValueError(f"Invalid message for Chatbot component: {chat_message}")
647
+
648
+ Chatbot._postprocess_chat_messages = _postprocess_chat_messages
649
+
650
+ demo = gr.Blocks(theme=gr.themes.Soft(**colors_dict), css=css_code, title="h2oGPT", analytics_enabled=False)
651
+ callback = gr.CSVLogger()
652
+ # css_code = 'body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en/site/header/master/_jcr_content/root/container/header_copy/logo.coreimg.svg/1678976605175/h2o-logo.svg");}'
653
+ # demo = gr.Blocks(theme='gstaff/xkcd', css=css_code)
654
+
655
+ model_options = flatten_list(list(prompt_type_to_model_name.values())) + kwargs['extra_model_options']
656
+ if kwargs['base_model'].strip() not in model_options:
657
+ lora_options = [kwargs['base_model'].strip()] + model_options
658
+ lora_options = kwargs['extra_lora_options']
659
+ if kwargs['lora_weights'].strip() not in lora_options:
660
+ lora_options = [kwargs['lora_weights'].strip()] + lora_options
661
+ # always add in no lora case
662
+ # add fake space so doesn't go away in gradio dropdown
663
+ no_lora_str = no_model_str = '[None/Remove]'
664
+ lora_options = [no_lora_str] + kwargs['extra_lora_options'] # FIXME: why double?
665
+ # always add in no model case so can free memory
666
+ # add fake space so doesn't go away in gradio dropdown
667
+ model_options = [no_model_str] + model_options
668
+
669
+ # transcribe, will be detranscribed before use by evaluate()
670
+ if not kwargs['lora_weights'].strip():
671
+ kwargs['lora_weights'] = no_lora_str
672
+
673
+ if not kwargs['base_model'].strip():
674
+ kwargs['base_model'] = no_model_str
675
+
676
+ # transcribe for gradio
677
+ kwargs['gpu_id'] = str(kwargs['gpu_id'])
678
+
679
+ no_model_msg = 'h2oGPT [ !!! Please Load Model in Models Tab !!! ]'
680
+ output_label0 = f'h2oGPT [Model: {kwargs.get("base_model")}]' if kwargs.get(
681
+ 'base_model') else no_model_msg
682
+ output_label0_model2 = no_model_msg
683
+
684
+ with demo:
685
+ # avoid actual model/tokenizer here or anything that would be bad to deepcopy
686
+ # https://github.com/gradio-app/gradio/issues/3558
687
+ model_state = gr.State(['model', 'tokenizer', device, kwargs['base_model']])
688
+ model_state2 = gr.State([None, None, None, None])
689
+ model_options_state = gr.State([model_options])
690
+ lora_options_state = gr.State([lora_options])
691
+ gr.Markdown(
692
+ f"""
693
+ <h1 align="center"> {title}</h1>
694
+
695
+ {description}
696
+ {task_info_md}
697
+ """)
698
+ if is_hf:
699
+ gr.HTML(
700
+ '''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
701
+
702
+ # go button visible if
703
+ base_wanted = kwargs['base_model'] != no_model_str and kwargs['login_mode_if_model0']
704
+ go_btn = gr.Button(value="ENTER", visible=base_wanted, variant="primary")
705
+ normal_block = gr.Row(visible=not base_wanted)
706
+ with normal_block:
707
+ with gr.Tabs():
708
+ with gr.Row():
709
+ col_nochat = gr.Column(visible=not kwargs['chat'])
710
+ with col_nochat: # FIXME: for model comparison, and check rest
711
+ text_output_nochat = gr.Textbox(lines=5, label=output_label0)
712
+ instruction_nochat = gr.Textbox(
713
+ lines=4, label=instruction_label_nochat,
714
+ placeholder=kwargs['placeholder_instruction'],
715
+ )
716
+ iinput_nochat = gr.Textbox(lines=4, label="Input context for Instruction",
717
+ placeholder=kwargs['placeholder_input'])
718
+ submit_nochat = gr.Button("Submit")
719
+ flag_btn_nochat = gr.Button("Flag")
720
+ if kwargs['score_model']:
721
+ if not kwargs['auto_score']:
722
+ with gr.Column():
723
+ score_btn_nochat = gr.Button("Score last prompt & response")
724
+ score_text_nochat = gr.Textbox("Response Score: NA", show_label=False)
725
+ else:
726
+ score_text_nochat = gr.Textbox("Response Score: NA", show_label=False)
727
+ col_chat = gr.Column(visible=kwargs['chat'])
728
+ with col_chat:
729
+ with gr.Row():
730
+ text_output = gr.Chatbot(label=output_label0).style(height=kwargs['height'] or 400)
731
+ text_output2 = gr.Chatbot(label=output_label0_model2, visible=False).style(
732
+ height=kwargs['height'] or 400)
733
+ with gr.Row():
734
+ with gr.Column(scale=50):
735
+ instruction = gr.Textbox(
736
+ lines=4, label=instruction_label,
737
+ placeholder=kwargs['placeholder_instruction'],
738
+ )
739
+ with gr.Row(): # .style(equal_height=False, equal_width=False):
740
+ submit = gr.Button(value='Submit').style(full_width=False, size='sm')
741
+ stop_btn = gr.Button(value="Stop").style(full_width=False, size='sm')
742
+ with gr.Row():
743
+ clear = gr.Button("New Conversation")
744
+ flag_btn = gr.Button("Flag")
745
+ if kwargs['score_model']:
746
+ if not kwargs['auto_score']: # FIXME: For checkbox model2
747
+ with gr.Column():
748
+ with gr.Row():
749
+ score_btn = gr.Button("Score last prompt & response").style(
750
+ full_width=False, size='sm')
751
+ score_text = gr.Textbox("Response Score: NA", show_label=False)
752
+ score_res2 = gr.Row(visible=False)
753
+ with score_res2:
754
+ score_btn2 = gr.Button("Score last prompt & response 2").style(
755
+ full_width=False, size='sm')
756
+ score_text2 = gr.Textbox("Response Score2: NA", show_label=False)
757
+ else:
758
+ score_text = gr.Textbox("Response Score: NA", show_label=False)
759
+ score_text2 = gr.Textbox("Response Score2: NA", show_label=False, visible=False)
760
+ retry = gr.Button("Regenerate")
761
+ undo = gr.Button("Undo")
762
+ with gr.TabItem("Input/Output"):
763
+ with gr.Row():
764
+ if 'mbart-' in kwargs['model_lower']:
765
+ src_lang = gr.Dropdown(list(languages_covered().keys()),
766
+ value=kwargs['src_lang'],
767
+ label="Input Language")
768
+ tgt_lang = gr.Dropdown(list(languages_covered().keys()),
769
+ value=kwargs['tgt_lang'],
770
+ label="Output Language")
771
+ with gr.TabItem("Expert"):
772
+ with gr.Row():
773
+ with gr.Column():
774
+ stream_output = gr.components.Checkbox(label="Stream output",
775
+ value=kwargs['stream_output'])
776
+ prompt_type = gr.Dropdown(prompt_types_strings,
777
+ value=kwargs['prompt_type'], label="Prompt Type",
778
+ visible=not is_public)
779
+ prompt_type2 = gr.Dropdown(prompt_types_strings,
780
+ value=kwargs['prompt_type'], label="Prompt Type Model 2",
781
+ visible=not is_public and False)
782
+ do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
783
+ value=kwargs['do_sample'])
784
+ temperature = gr.Slider(minimum=0.01, maximum=3,
785
+ value=kwargs['temperature'],
786
+ label="Temperature",
787
+ info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
788
+ top_p = gr.Slider(minimum=0, maximum=1,
789
+ value=kwargs['top_p'], label="Top p",
790
+ info="Cumulative probability of tokens to sample from")
791
+ top_k = gr.Slider(
792
+ minimum=0, maximum=100, step=1,
793
+ value=kwargs['top_k'], label="Top k",
794
+ info='Num. tokens to sample from'
795
+ )
796
+ max_beams = 8 if not is_low_mem else 2
797
+ num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
798
+ value=min(max_beams, kwargs['num_beams']), label="Beams",
799
+ info="Number of searches for optimal overall probability. "
800
+ "Uses more GPU memory/compute")
801
+ max_max_new_tokens = 2048 if not is_low_mem else kwargs['max_new_tokens']
802
+ max_new_tokens = gr.Slider(
803
+ minimum=1, maximum=max_max_new_tokens, step=1,
804
+ value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
805
+ )
806
+ min_new_tokens = gr.Slider(
807
+ minimum=0, maximum=max_max_new_tokens, step=1,
808
+ value=min(max_max_new_tokens, kwargs['min_new_tokens']), label="Min output length",
809
+ )
810
+ early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
811
+ value=kwargs['early_stopping'])
812
+ max_max_time = 60 * 5 if not is_low_mem else 60
813
+ max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
814
+ value=min(max_max_time, kwargs['max_time']), label="Max. time",
815
+ info="Max. time to search optimal output.")
816
+ repetition_penalty = gr.Slider(minimum=0.01, maximum=3.0,
817
+ value=kwargs['repetition_penalty'],
818
+ label="Repetition Penalty")
819
+ num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
820
+ value=kwargs['num_return_sequences'],
821
+ label="Number Returns", info="Must be <= num_beams",
822
+ visible=not is_public)
823
+ iinput = gr.Textbox(lines=4, label="Input",
824
+ placeholder=kwargs['placeholder_input'],
825
+ visible=not is_public)
826
+ context = gr.Textbox(lines=3, label="System Pre-Context",
827
+ info="Directly pre-appended without prompt processing",
828
+ visible=not is_public and not kwargs['chat'])
829
+ chat = gr.components.Checkbox(label="Chat mode", value=kwargs['chat'],
830
+ visible=not is_public)
831
+
832
+ with gr.TabItem("Models"):
833
+ load_msg = "Load-Unload Model/LORA" if not is_public \
834
+ else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO"
835
+ load_msg2 = "Load-Unload Model/LORA 2" if not is_public \
836
+ else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO 2"
837
+ compare_checkbox = gr.components.Checkbox(label="Compare Mode",
838
+ value=False, visible=not is_public)
839
+ with gr.Row():
840
+ n_gpus = torch.cuda.device_count()
841
+ n_gpus_list = [str(x) for x in list(range(-1, n_gpus))]
842
+ with gr.Column():
843
+ with gr.Row(scale=1):
844
+ with gr.Column(scale=50):
845
+ model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model",
846
+ value=kwargs['base_model'])
847
+ lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA",
848
+ value=kwargs['lora_weights'], visible=kwargs['show_lora'])
849
+ with gr.Column(scale=1):
850
+ load_model_button = gr.Button(load_msg)
851
+ model_load8bit_checkbox = gr.components.Checkbox(
852
+ label="Load 8-bit [Not all models support]",
853
+ value=kwargs['load_8bit'])
854
+ model_infer_devices_checkbox = gr.components.Checkbox(
855
+ label="Infer Devices [If GPU ID=-1 or not Checked, then will spread model over GPUs]",
856
+ value=kwargs['infer_devices'])
857
+ model_gpu = gr.Dropdown(n_gpus_list, label="GPU ID [-1 = all GPUs]",
858
+ value=kwargs['gpu_id'])
859
+ model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
860
+ lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'],
861
+ visible=kwargs['show_lora'])
862
+ with gr.Row(scale=1):
863
+ with gr.Column(scale=50):
864
+ new_model = gr.Textbox(label="New Model HF name/path")
865
+ new_lora = gr.Textbox(label="New LORA HF name/path", visible=kwargs['show_lora'])
866
+ with gr.Column(scale=1):
867
+ add_model_button = gr.Button("Add new model name")
868
+ add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
869
+ col_model2 = gr.Column(visible=False)
870
+ with col_model2:
871
+ with gr.Row(scale=1):
872
+ with gr.Column(scale=50):
873
+ model_choice2 = gr.Dropdown(model_options_state.value[0], label="Choose Model 2",
874
+ value=no_model_str)
875
+ lora_choice2 = gr.Dropdown(lora_options_state.value[0], label="Choose LORA 2",
876
+ value=no_lora_str,
877
+ visible=kwargs['show_lora'])
878
+ with gr.Column(scale=1):
879
+ load_model_button2 = gr.Button(load_msg2)
880
+ model_load8bit_checkbox2 = gr.components.Checkbox(
881
+ label="Load 8-bit 2 [Not all models support]",
882
+ value=kwargs['load_8bit'])
883
+ model_infer_devices_checkbox2 = gr.components.Checkbox(
884
+ label="Infer Devices 2 [If GPU ID=-1 or not Checked, then will spread model over GPUs]",
885
+ value=kwargs[
886
+ 'infer_devices'])
887
+ model_gpu2 = gr.Dropdown(n_gpus_list, label="GPU ID [-1 = all GPUs]",
888
+ value=kwargs['gpu_id'])
889
+ # no model/lora loaded ever in model2 by default
890
+ model_used2 = gr.Textbox(label="Current Model 2", value=no_model_str)
891
+ lora_used2 = gr.Textbox(label="Current LORA 2", value=no_lora_str,
892
+ visible=kwargs['show_lora'])
893
+ with gr.TabItem("System"):
894
+ system_row = gr.Row(visible=not is_public)
895
+ admin_pass_textbox = gr.Textbox(label="Admin Password", type='password', visible=is_public)
896
+ admin_btn = gr.Button(value="admin", visible=is_public)
897
+ with system_row:
898
+ with gr.Column():
899
+ system_text = gr.Textbox(label='System Info')
900
+ system_btn = gr.Button(value='Get System Info')
901
+
902
+ zip_btn = gr.Button("Zip")
903
+ file_output = gr.File()
904
+
905
+ # Get flagged data
906
+ zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
907
+ zip_btn.click(zip_data1, inputs=None, outputs=file_output)
908
+
909
+ def check_admin_pass(x):
910
+ return gr.update(visible=x == admin_pass)
911
+
912
+ admin_btn.click(check_admin_pass, inputs=admin_pass_textbox, outputs=system_row)
913
+
914
+ # Get inputs to evaluate()
915
+ inputs_list = get_inputs_list(locals(), kwargs['model_lower'])
916
+ from functools import partial
917
+ all_kwargs = kwargs.copy()
918
+ all_kwargs.update(locals())
919
+ kwargs_evaluate = {k: v for k, v in all_kwargs.items() if k in inputs_kwargs_list}
920
+ fun = partial(evaluate,
921
+ **kwargs_evaluate)
922
+ fun2 = partial(evaluate,
923
+ model_state2,
924
+ **kwargs_evaluate)
925
+
926
+ dark_mode_btn = gr.Button("Dark Mode", variant="primary").style(
927
+ size="sm",
928
+ )
929
+ dark_mode_btn.click(
930
+ None,
931
+ None,
932
+ None,
933
+ _js="""() => {
934
+ if (document.querySelectorAll('.dark').length) {
935
+ document.querySelectorAll('.dark').forEach(el => el.classList.remove('dark'));
936
+ } else {
937
+ document.querySelector('body').classList.add('dark');
938
+ }
939
+ }""",
940
+ api_name="dark",
941
+ )
942
+
943
+ # Control chat and non-chat blocks, which can be independently used by chat checkbox swap
944
+ def col_nochat_fun(x):
945
+ return gr.Column.update(visible=not x)
946
+
947
+ def col_chat_fun(x):
948
+ return gr.Column.update(visible=x)
949
+
950
+ def context_fun(x):
951
+ return gr.Textbox.update(visible=not x)
952
+
953
+ chat.select(col_nochat_fun, chat, col_nochat, api_name="chat_checkbox") \
954
+ .then(col_chat_fun, chat, col_chat) \
955
+ .then(context_fun, chat, context)
956
+
957
+ # examples after submit or any other buttons for chat or no chat
958
+ if kwargs['examples'] is not None and kwargs['show_examples']:
959
+ gr.Examples(examples=kwargs['examples'], inputs=inputs_list)
960
+
961
+ # Score
962
+ def score_last_response(*args, nochat=False, model2=False):
963
+ """ Similar to user() """
964
+ args_list = list(args)
965
+
966
+ max_length_tokenize = 512 if is_low_mem else 2048
967
+ cutoff_len = max_length_tokenize * 4 # restrict deberta related to max for LLM
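+ # cutoff_len is in characters; the factor of 4 presumably assumes ~4 characters per token on average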
968
+
969
+ if not nochat:
970
+ history = args_list[-1]
971
+ if history is None:
972
+ if not model2:
973
+ # maybe only doing first model, no need to complain
974
+ print("Bad history in scoring last response, fix for now", flush=True)
975
+ history = []
976
+ if smodel is not None and \
977
+ stokenizer is not None and \
978
+ sdevice is not None and \
979
+ history is not None and len(history) > 0 and \
980
+ history[-1] is not None and \
981
+ len(history[-1]) >= 2:
982
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
983
+
984
+ question = history[-1][0]
985
+
986
+ answer = history[-1][1]
987
+ else:
988
+ return 'Response Score: NA'
989
+ else:
990
+ answer = args_list[-1]
991
+ instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
992
+ question = args_list[instruction_nochat_arg_id]
993
+
994
+ if question is None:
995
+ return 'Response Score: Bad Question'
996
+ if answer is None:
997
+ return 'Response Score: Bad Answer'
998
+
999
+ question = question[-cutoff_len:]
1000
+ answer = answer[-cutoff_len:]
1001
+
1002
+ inputs = stokenizer(question, answer,
1003
+ return_tensors="pt",
1004
+ truncation=True,
1005
+ max_length=max_length_tokenize).to(smodel.device)
1006
+ try:
1007
+ score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
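+ # the reward model returns a single logit; sigmoid maps it to a 0-1 score for the (question, answer) pair, formatted as a percentage below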
1008
+ except torch.cuda.OutOfMemoryError as e:
1009
+ print("GPU OOM: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
1010
+ del inputs
1011
+ traceback.print_exc()
1012
+ clear_torch_cache()
1013
+ return 'Response Score: GPU OOM'
1014
+ except (Exception, RuntimeError) as e:
1015
+ if 'Expected all tensors to be on the same device' in str(e) or \
1016
+ 'expected scalar type Half but found Float' in str(e) or \
1017
+ 'probability tensor contains either' in str(e) or \
1018
+ 'cublasLt ran into an error!' in str(e):
1019
+ print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)),
1020
+ flush=True)
1021
+ traceback.print_exc()
1022
+ clear_torch_cache()
1023
+ return 'Response Score: GPU Error'
1024
+ else:
1025
+ raise
1026
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
1027
+ return 'Response Score: {:.1%}'.format(score)
1028
+
1029
+ if kwargs['score_model']:
1030
+ score_args = dict(fn=score_last_response,
1031
+ inputs=inputs_list + [text_output],
1032
+ outputs=[score_text],
1033
+ )
1034
+ score_args2 = dict(fn=partial(score_last_response, model2=True),
1035
+ inputs=inputs_list + [text_output2],
1036
+ outputs=[score_text2],
1037
+ )
1038
+
1039
+ score_args_nochat = dict(fn=partial(score_last_response, nochat=True),
1040
+ inputs=inputs_list + [text_output_nochat],
1041
+ outputs=[score_text_nochat],
1042
+ )
1043
+ if not kwargs['auto_score']:
1044
+ score_event = score_btn.click(**score_args, queue=stream_output, api_name='score') \
1045
+ .then(**score_args2, queue=stream_output, api_name='score2')
1046
+ score_event_nochat = score_btn_nochat.click(**score_args_nochat, queue=stream_output,
1047
+ api_name='score_nochat')
1048
+
1049
+ def user(*args, undo=False, sanitize_user_prompt=True, model2=False):
1050
+ """
1051
+ User that fills history for bot
1052
+ :param args:
1053
+ :param undo:
1054
+ :param sanitize_user_prompt:
1055
+ :param model2:
1056
+ :return:
1057
+ """
1058
+ args_list = list(args)
1059
+ user_message = args_list[0]
1060
+ input1 = args_list[1]
1061
+ context1 = args_list[2]
1062
+ if input1 and not user_message.endswith(':'):
1063
+ user_message1 = user_message + ":" + input1
1064
+ elif input1:
1065
+ user_message1 = user_message + input1
1066
+ else:
1067
+ user_message1 = user_message
1068
+ if sanitize_user_prompt:
1069
+ from better_profanity import profanity
1070
+ user_message1 = profanity.censor(user_message1)
1071
+
1072
+ history = args_list[-1]
1073
+ if undo and history:
1074
+ history.pop()
1075
+ args_list = args_list[:-1] # FYI, even if unused currently
1076
+ if history is None:
1077
+ if not model2:
1078
+ # no need to complain so often unless model1
1079
+ print("Bad history, fix for now", flush=True)
1080
+ history = []
1081
+ # ensure elements not mixed across models as output,
1082
+ # even if input is currently same source
1083
+ history = history.copy()
1084
+ if undo:
1085
+ return history
1086
+ else:
1087
+ # FIXME: compare, same history for now
1088
+ return history + [[user_message1, None]]
1089
+
1090
+ def bot(*args, retry=False):
1091
+ """
1092
+ bot that consumes the history filled in by user() for the user input;
1093
+ the instruction (from inputs_list) itself is not consumed by bot
1094
+ :param args:
1095
+ :param retry:
1096
+ :return:
1097
+ """
1098
+ args_list = list(args).copy()
1099
+ history = args_list[-1] # model_state is -2
1100
+ if retry and history:
1101
+ history.pop()
1102
+ if not history:
1103
+ print("No history", flush=True)
1104
+ return
1105
+ # ensure output will be unique to models
1106
+ history = history.copy()
1107
+ instruction1 = history[-1][0]
1108
+ context1 = ''
1109
+ if kwargs['chat_history'] > 0:
1110
+ prompt_type_arg_id = eval_func_param_names.index('prompt_type')
1111
+ prompt_type1 = args_list[prompt_type_arg_id]
1112
+ chat_arg_id = eval_func_param_names.index('chat')
1113
+ chat1 = args_list[chat_arg_id]
1114
+ context1 = ''
1115
+ for histi in range(len(history) - 1):
1116
+ data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
1117
+ context1 += generate_prompt(data_point, prompt_type1, chat1, reduced=True)[0].replace(
1118
+ '<br>', '\n')
1119
+ if not context1.endswith('\n'):
1120
+ context1 += '\n'
1121
+ if context1 and not context1.endswith('\n'):
1122
+ context1 += '\n' # ensure if terminates abruptly, then human continues on next line
1123
+ args_list[0] = instruction1 # override original instruction with history from user
1124
+ # only include desired chat history
1125
+ args_list[2] = context1[-kwargs['chat_history']:]
1126
+ model_state1 = args_list[-2]
1127
+ if model_state1[0] is None or model_state1[0] == no_model_str:
1128
+ return
1129
+ args_list = args_list[:-2]
1130
+ fun1 = partial(evaluate,
1131
+ model_state1,
1132
+ **kwargs_evaluate)
1133
+ try:
1134
+ for output in fun1(*tuple(args_list)):
1135
+ bot_message = output
1136
+ history[-1][1] = bot_message
1137
+ yield history
1138
+ except StopIteration:
1139
+ yield history
1140
+ except RuntimeError as e:
1141
+ if "generator raised StopIteration" in str(e):
1142
+ # assume last entry was bad, undo
1143
+ history.pop()
1144
+ yield history
1145
+ raise
1146
+ except Exception as e:
1147
+ # put error into user input
1148
+ history[-1][0] = "Exception: %s" % str(e)
1149
+ yield history
1150
+ raise
1151
+ return
1152
+
1153
+ # NORMAL MODEL
1154
+ user_args = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt']),
1155
+ inputs=inputs_list + [text_output],
1156
+ outputs=text_output,
1157
+ )
1158
+ bot_args = dict(fn=bot,
1159
+ inputs=inputs_list + [model_state] + [text_output],
1160
+ outputs=text_output,
1161
+ )
1162
+ retry_bot_args = dict(fn=functools.partial(bot, retry=True),
1163
+ inputs=inputs_list + [model_state] + [text_output],
1164
+ outputs=text_output,
1165
+ )
1166
+ undo_user_args = dict(fn=functools.partial(user, undo=True),
1167
+ inputs=inputs_list + [text_output],
1168
+ outputs=text_output,
1169
+ )
1170
+
1171
+ # MODEL2
1172
+ user_args2 = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt'], model2=True),
1173
+ inputs=inputs_list + [text_output2],
1174
+ outputs=text_output2,
1175
+ )
1176
+ bot_args2 = dict(fn=bot,
1177
+ inputs=inputs_list + [model_state2] + [text_output2],
1178
+ outputs=text_output2,
1179
+ )
1180
+ retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
1181
+ inputs=inputs_list + [model_state2] + [text_output2],
1182
+ outputs=text_output2,
1183
+ )
1184
+ undo_user_args2 = dict(fn=functools.partial(user, undo=True),
1185
+ inputs=inputs_list + [text_output2],
1186
+ outputs=text_output2,
1187
+ )
1188
+
1189
+ def clear_instruct():
1190
+ return gr.Textbox.update(value='')
1191
+
1192
+ if kwargs['auto_score']:
1193
+ # in case 2nd model, consume instruction first, so can clear quickly
1194
+ # bot doesn't consume the instruction itself, just the history from user(), which is why this works
1195
+ submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction') \
1196
+ .then(**user_args2, queue=stream_output, api_name='instruction2') \
1197
+ .then(clear_instruct, None, instruction) \
1198
+ .then(**bot_args, api_name='instruction_bot') \
1199
+ .then(**score_args, api_name='instruction_bot_score') \
1200
+ .then(**bot_args2, api_name='instruction_bot2') \
1201
+ .then(**score_args2, api_name='instruction_bot_score2') \
1202
+ .then(clear_torch_cache)
1203
+ submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit') \
1204
+ .then(**user_args2, queue=stream_output, api_name='submit2') \
1205
+ .then(**bot_args, api_name='submit_bot') \
1206
+ .then(clear_instruct, None, instruction) \
1207
+ .then(**score_args, api_name='submit_bot_score') \
1208
+ .then(**bot_args2, api_name='submit_bot2') \
1209
+ .then(**score_args2, api_name='submit_bot_score2') \
1210
+ .then(clear_torch_cache)
1211
+ submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry') \
1212
+ .then(**user_args2, queue=stream_output, api_name='retry2') \
1213
+ .then(clear_instruct, None, instruction) \
1214
+ .then(**retry_bot_args, api_name='retry_bot') \
1215
+ .then(**score_args, api_name='retry_bot_score') \
1216
+ .then(**retry_bot_args2, api_name='retry_bot2') \
1217
+ .then(**score_args2, api_name='retry_bot_score2') \
1218
+ .then(clear_torch_cache)
1219
+ submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo') \
1220
+ .then(**score_args, api_name='undo_score') \
1221
+ .then(**undo_user_args2, queue=stream_output, api_name='undo2') \
1222
+ .then(**score_args2, api_name='undo_score2') \
1223
+ .then(clear_instruct, None, instruction)
1224
+ else:
1225
+ submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction') \
1226
+ .then(**user_args2, queue=stream_output, api_name='instruction2') \
1227
+ .then(clear_instruct, None, instruction) \
1228
+ .then(**bot_args, api_name='instruction_bot') \
1229
+ .then(**bot_args2, api_name='instruction_bot2') \
1230
+ .then(clear_torch_cache)
1231
+ submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit') \
1232
+ .then(**user_args2, queue=stream_output, api_name='submit2') \
1233
+ .then(clear_instruct, None, instruction) \
1234
+ .then(**bot_args, api_name='submit_bot') \
1235
+ .then(**bot_args2, api_name='submit_bot2') \
1236
+ .then(clear_torch_cache)
1237
+ submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry') \
1238
+ .then(**user_args2, queue=stream_output, api_name='retry2') \
1239
+ .then(clear_instruct, None, instruction) \
1240
+ .then(**retry_bot_args, api_name='retry_bot') \
1241
+ .then(**retry_bot_args2, api_name='retry_bot2') \
1242
+ .then(clear_torch_cache)
1243
+ submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo') \
1244
+ .then(**undo_user_args2, queue=stream_output, api_name='undo2')
1245
+
1246
+ # does both models
1247
+ clear.click(lambda: None, None, text_output, queue=False, api_name='clear') \
1248
+ .then(lambda: None, None, text_output2, queue=False, api_name='clear2')
1249
+ # FIXME: compare
1250
+ submit_event_nochat = submit_nochat.click(fun, inputs=[model_state] + inputs_list,
1251
+ outputs=text_output_nochat, api_name='submit_nochat') \
1252
+ .then(**score_args_nochat, api_name='instruction_bot_score_nochat') \
1253
+ .then(clear_torch_cache)
1254
+
1255
+ def load_model(model_name, lora_weights, model_state_old, prompt_type_old, load_8bit, infer_devices, gpu_id):
1256
+ # ensure old model removed from GPU memory
1257
+ if kwargs['debug']:
1258
+ print("Pre-switch pre-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1259
+
1260
+ if isinstance(model_state_old[0], str) and model0 is not None:
1261
+ # best can do, move model loaded at first to CPU
1262
+ model0.cpu()
1263
+
1264
+ if model_state_old[0] is not None and not isinstance(model_state_old[0], str):
1265
+ try:
1266
+ model_state_old[0].cpu()
1267
+ except Exception as e:
1268
+ # sometimes hit NotImplementedError: Cannot copy out of meta tensor; no data!
1269
+ print("Unable to put model on CPU: %s" % str(e), flush=True)
1270
+ del model_state_old[0]
1271
+ model_state_old[0] = None
1272
+
1273
+ if model_state_old[1] is not None and not isinstance(model_state_old[1], str):
1274
+ del model_state_old[1]
1275
+ model_state_old[1] = None
1276
+
1277
+ clear_torch_cache()
1278
+ if kwargs['debug']:
1279
+ print("Pre-switch post-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1280
+
1281
+ if model_name is None or model_name == no_model_str:
1282
+ # no-op if no model, just free memory
1283
+ # no detranscribe needed for model, never go into evaluate
1284
+ lora_weights = no_lora_str
1285
+ return [None, None, None, model_name], model_name, lora_weights, prompt_type_old
1286
+
1287
+ all_kwargs1 = all_kwargs.copy()
1288
+ all_kwargs1['base_model'] = model_name.strip()
1289
+ all_kwargs1['load_8bit'] = load_8bit
1290
+ all_kwargs1['infer_devices'] = infer_devices
1291
+ all_kwargs1['gpu_id'] = int(gpu_id) # detranscribe
1292
+ model_lower = model_name.strip().lower()
1293
+ if model_lower in inv_prompt_type_to_model_lower:
1294
+ prompt_type1 = inv_prompt_type_to_model_lower[model_lower]
1295
+ else:
1296
+ prompt_type1 = prompt_type_old
1297
+
1298
+ # detranscribe
1299
+ if lora_weights == no_lora_str:
1300
+ lora_weights = ''
1301
+
1302
+ all_kwargs1['lora_weights'] = lora_weights.strip()
1303
+ model1, tokenizer1, device1 = get_model(**all_kwargs1)
1304
+ clear_torch_cache()
1305
+
1306
+ if kwargs['debug']:
1307
+ print("Post-switch GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1308
+ return [model1, tokenizer1, device1, model_name], model_name, lora_weights, prompt_type1
1309
+
1310
+ def dropdown_prompt_type_list(x):
1311
+ return gr.Dropdown.update(value=x)
1312
+
1313
+ def chatbot_list(x, model_used_in):
1314
+ return gr.Textbox.update(label=f'h2oGPT [Model: {model_used_in}]')
1315
+
1316
+ load_model_args = dict(fn=load_model,
1317
+ inputs=[model_choice, lora_choice, model_state, prompt_type,
1318
+ model_load8bit_checkbox, model_infer_devices_checkbox, model_gpu],
1319
+ outputs=[model_state, model_used, lora_used, prompt_type])
1320
+ prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
1321
+ chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
1322
+ nochat_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output_nochat)
1323
+ if not is_public:
1324
+ load_model_event = load_model_button.click(**load_model_args) \
1325
+ .then(**prompt_update_args) \
1326
+ .then(**chatbot_update_args) \
1327
+ .then(**nochat_update_args) \
1328
+ .then(clear_torch_cache)
1329
+
1330
+ load_model_args2 = dict(fn=load_model,
1331
+ inputs=[model_choice2, lora_choice2, model_state2, prompt_type2,
1332
+ model_load8bit_checkbox2, model_infer_devices_checkbox2, model_gpu2],
1333
+ outputs=[model_state2, model_used2, lora_used2, prompt_type2])
1334
+ prompt_update_args2 = dict(fn=dropdown_prompt_type_list, inputs=prompt_type2, outputs=prompt_type2)
1335
+ chatbot_update_args2 = dict(fn=chatbot_list, inputs=[text_output2, model_used2], outputs=text_output2)
1336
+ if not is_public:
1337
+ load_model_event2 = load_model_button2.click(**load_model_args2) \
1338
+ .then(**prompt_update_args2) \
1339
+ .then(**chatbot_update_args2) \
1340
+ .then(clear_torch_cache)
1341
+
1342
+ def dropdown_model_list(list0, x):
1343
+ new_state = [list0[0] + [x]]
1344
+ new_options = [*new_state[0]]
1345
+ return gr.Dropdown.update(value=x, choices=new_options), \
1346
+ gr.Dropdown.update(value=x, choices=new_options), \
1347
+ '', new_state
1348
+
1349
+ add_model_event = add_model_button.click(fn=dropdown_model_list,
1350
+ inputs=[model_options_state, new_model],
1351
+ outputs=[model_choice, model_choice2, new_model, model_options_state])
1352
+
1353
+ def dropdown_lora_list(list0, x, model_used1, lora_used1, model_used2, lora_used2):
1354
+ new_state = [list0[0] + [x]]
1355
+ new_options = [*new_state[0]]
1356
+ # don't switch drop-down to added lora if already have model loaded
1357
+ x1 = x if model_used1 == no_model_str else lora_used1
1358
+ x2 = x if model_used2 == no_model_str else lora_used2
1359
+ return gr.Dropdown.update(value=x1, choices=new_options), \
1360
+ gr.Dropdown.update(value=x2, choices=new_options), \
1361
+ '', new_state
1362
+
1363
+ add_lora_event = add_lora_button.click(fn=dropdown_lora_list,
1364
+ inputs=[lora_options_state, new_lora, model_used, lora_used, model_used2, lora_used2],
1365
+ outputs=[lora_choice, lora_choice2, new_lora, lora_options_state])
1366
+
1367
+ go_btn.click(lambda: gr.update(visible=False), None, go_btn, api_name="go") \
1368
+ .then(lambda: gr.update(visible=True), None, normal_block) \
1369
+ .then(**load_model_args).then(**prompt_update_args)
1370
+
1371
+ def compare_textbox_fun(x):
1372
+ return gr.Textbox.update(visible=x)
1373
+
1374
+ def compare_column_fun(x):
1375
+ return gr.Column.update(visible=x)
1376
+
1377
+ def compare_prompt_fun(x):
1378
+ return gr.Dropdown.update(visible=x)
1379
+
1380
+ compare_checkbox.select(compare_textbox_fun, compare_checkbox, text_output2, api_name="compare_checkbox") \
1381
+ .then(compare_column_fun, compare_checkbox, col_model2) \
1382
+ .then(compare_prompt_fun, compare_checkbox, prompt_type2) \
1383
+ .then(compare_textbox_fun, compare_checkbox, score_text2)
1384
+ # FIXME: add score_res2 in condition, but do better
1385
+
1386
+ # callback for logging flagged input/output
1387
+ callback.setup(inputs_list + [text_output], "flagged_data_points")
1388
+ flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1389
+ api_name='flag')
1390
+ flag_btn_nochat.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1391
+ api_name='flag_nochat')
1392
+
1393
+ def get_system_info():
1394
+ return gr.Textbox.update(value=system_info_print())
1395
+
1396
+ system_event = system_btn.click(get_system_info, outputs=system_text, api_name='system_info')
1397
+
1398
+ # don't pass text_output, don't want to clear output, just stop it
1399
+ # FIXME: have to click once to stop output and second time to stop GPUs going
1400
+ stop_btn.click(lambda: None, None, None,
1401
+ cancels=[submit_event_nochat, submit_event, submit_event2, submit_event3],
1402
+ queue=False, api_name='stop').then(clear_torch_cache)
1403
+
1404
+ demo.queue(concurrency_count=1)
1405
+ favicon_path = "h2o-logo.svg"
1406
+ demo.launch(share=kwargs['share'], server_name="0.0.0.0", show_error=True,
1407
+ favicon_path=favicon_path, prevent_thread_lock=True) # , enable_queue=True)
1408
+ print("Started GUI", flush=True)
1409
+ demo.block_thread()
1410
+
1411
+
1412
+ input_args_list = ['model_state']
1413
+ inputs_kwargs_list = ['debug', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0']
1414
+
1415
+
1416
+ def get_inputs_list(inputs_dict, model_lower):
1417
+ """
1418
+ map gradio objects in locals() to inputs for evaluate().
1419
+ :param inputs_dict:
1420
+ :param model_lower:
1421
+ :return:
1422
+ """
1423
+ inputs_list_names = list(inspect.signature(evaluate).parameters)
1424
+ inputs_list = []
1425
+ for k in inputs_list_names:
1426
+ if k == 'kwargs':
1427
+ continue
1428
+ if k in input_args_list + inputs_kwargs_list:
1429
+ # these are added via partial, not taken as input
1430
+ continue
1431
+ if 'mbart-' not in model_lower and k in ['src_lang', 'tgt_lang']:
1432
+ continue
1433
+ inputs_list.append(inputs_dict[k])
1434
+ return inputs_list
1435
+
1436
+
1437
+ eval_func_param_names = ['instruction',
1438
+ 'iinput',
1439
+ 'context',
1440
+ 'stream_output',
1441
+ 'prompt_type',
1442
+ 'temperature',
1443
+ 'top_p',
1444
+ 'top_k',
1445
+ 'num_beams',
1446
+ 'max_new_tokens',
1447
+ 'min_new_tokens',
1448
+ 'early_stopping',
1449
+ 'max_time',
1450
+ 'repetition_penalty',
1451
+ 'num_return_sequences',
1452
+ 'do_sample',
1453
+ 'chat',
1454
+ 'instruction_nochat',
1455
+ 'iinput_nochat',
1456
+ ]
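+ # NOTE: order must match the parameters of evaluate() below (between its START/END NOTE markers) and the example rows built in get_generate_params()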
1457
+
1458
+
1459
+ def evaluate(
1460
+ model_state,
1461
+ # START NOTE: Examples must have same order of parameters
1462
+ instruction,
1463
+ iinput,
1464
+ context,
1465
+ stream_output,
1466
+ prompt_type,
1467
+ temperature,
1468
+ top_p,
1469
+ top_k,
1470
+ num_beams,
1471
+ max_new_tokens,
1472
+ min_new_tokens,
1473
+ early_stopping,
1474
+ max_time,
1475
+ repetition_penalty,
1476
+ num_return_sequences,
1477
+ do_sample,
1478
+ chat,
1479
+ instruction_nochat,
1480
+ iinput_nochat,
1481
+ # END NOTE: Examples must have same order of parameters
1482
+ src_lang=None,
1483
+ tgt_lang=None,
1484
+ debug=False,
1485
+ save_dir=None,
1486
+ hard_stop_list=None,
1487
+ sanitize_bot_response=True,
1488
+ model_state0=None,
1489
+ **kwargs,
1490
+ ):
1491
+ if debug:
1492
+ locals_dict = locals().copy()
1493
+ locals_dict.pop('model_state', None)
1494
+ locals_dict.pop('model_state0', None)
1495
+ print(locals_dict)
1496
+
1497
+ no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
1498
+
1499
+ if model_state0 is None:
1500
+ # e.g. for no gradio case, set dummy value, else should be set
1501
+ model_state0 = [None, None, None, None]
1502
+
1503
+ if model_state is not None and len(model_state) == 4 and not isinstance(model_state[0], str):
1504
+ # try to free-up original model (i.e. list was passed as reference)
1505
+ if model_state0 is not None and model_state0[0] is not None:
1506
+ model_state0[0].cpu()
1507
+ model_state0[0] = None
1508
+ # try to free-up original tokenizer (i.e. list was passed as reference)
1509
+ if model_state0 is not None and model_state0[1] is not None:
1510
+ model_state0[1] = None
1511
+ clear_torch_cache()
1512
+ model, tokenizer, device, base_model = model_state
1513
+ elif model_state0 is not None and len(model_state0) == 4 and model_state0[0] is not None:
1514
+ assert isinstance(model_state[0], str)
1515
+ model, tokenizer, device, base_model = model_state0
1516
+ else:
1517
+ raise AssertionError(no_model_msg)
1518
+
1519
+ if base_model is None:
1520
+ raise AssertionError(no_model_msg)
1521
+
1522
+ assert base_model.strip(), no_model_msg
1523
+ assert model, "Model is missing"
1524
+ assert tokenizer, "Tokenizer is missing"
1525
+
1526
+ # choose chat or non-chat mode
1527
+ if not chat:
1528
+ instruction = instruction_nochat
1529
+ iinput = iinput_nochat
1530
+
1531
+ data_point = dict(context=context, instruction=instruction, input=iinput)
1532
+ prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
1533
+ prompt = prompter.generate_prompt(data_point)
1534
+
1535
+ if hard_stop_list is None:
1536
+ # acts like undo on user entry and bot response
1537
+ hard_stop_list = []
1538
+
1539
+ if isinstance(tokenizer, str):
1540
+ # pipeline
1541
+ if tokenizer == "summarization":
1542
+ key = 'summary_text'
1543
+ else:
1544
+ raise RuntimeError("No such task type %s" % tokenizer)
1545
+ # NOTE: uses max_length only
1546
+ yield model(prompt, max_length=max_new_tokens)[0][key]
+ return  # pipeline path finished; don't fall through to the HF generate path below
1547
+
1548
+ if 'mbart-' in base_model.lower():
1549
+ assert src_lang is not None
1550
+ tokenizer.src_lang = languages_covered()[src_lang]
1551
+
1552
+ if chat:
1553
+ # override, ignore user change
1554
+ num_return_sequences = 1
1555
+ if prompt_type in ['human_bot', 'instruct_vicuna', 'instruct_with_end']:
1556
+ if prompt_type == 'human_bot':
1557
+ # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
1558
+ # stopping only starts once output is beyond prompt
1559
+ # 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
1560
+ stop_words = [human, bot, '\n' + human, '\n' + bot]
1561
+ encounters = [1, 2]
1562
+ elif prompt_type == 'instruct_vicuna':
1563
+ # even below is not enough, generic strings and many ways to encode
1564
+ stop_words = [
1565
+ '### Human:',
1566
+ """
1567
+ ### Human:""",
1568
+ """
1569
+ ### Human:
1570
+ """,
1571
+ '### Assistant:',
1572
+ """
1573
+ ### Assistant:""",
1574
+ """
1575
+ ### Assistant:
1576
+ """,
1577
+ ]
1578
+ encounters = [1, 2]
1579
+ else:
1580
+ # some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
1581
+ stop_words = ['### End']
1582
+ encounters = [1]
1583
+ stop_words_ids = [
1584
+ tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
1585
+ # handle single token case
1586
+ stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
1587
+ stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
1588
+ # avoid padding in front of tokens
1589
+ if tokenizer.pad_token:
1590
+ stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
1591
+ # handle fake \n added
1592
+ stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
1593
+ # build stopper
1594
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
1595
+ else:
1596
+ stopping_criteria = StoppingCriteriaList()
1597
+
1598
+ # help to avoid errors like:
1599
+ # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
1600
+ # RuntimeError: expected scalar type Half but found Float
1601
+ # with - 256
1602
+ max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
1603
+ cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
1604
+ output_smallest = 30 * 4
1605
+ prompt = prompt[-cutoff_len - output_smallest:]
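+ # keep only the most recent part of the prompt, reserving room for at least ~30 tokens of output (again assuming ~4 chars per token)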
1606
+ inputs = tokenizer(prompt,
1607
+ return_tensors="pt",
1608
+ truncation=True,
1609
+ max_length=max_length_tokenize)
1610
+ if debug and len(inputs["input_ids"]) > 0:
1611
+ print('input_ids length', len(inputs["input_ids"][0]), flush=True)
1612
+ input_ids = inputs["input_ids"].to(device)
1613
+ generation_config = GenerationConfig(
1614
+ temperature=float(temperature),
1615
+ top_p=float(top_p),
1616
+ top_k=top_k,
1617
+ num_beams=num_beams,
1618
+ do_sample=do_sample,
1619
+ repetition_penalty=float(repetition_penalty),
1620
+ num_return_sequences=num_return_sequences,
1621
+ renormalize_logits=True,
1622
+ remove_invalid_values=True,
1623
+ **kwargs,
1624
+ )
1625
+
1626
+ gen_kwargs = dict(input_ids=input_ids,
1627
+ generation_config=generation_config,
1628
+ return_dict_in_generate=True,
1629
+ output_scores=True,
1630
+ max_new_tokens=max_new_tokens, # prompt + new
1631
+ min_new_tokens=min_new_tokens, # prompt + new
1632
+ early_stopping=early_stopping, # False, True, "never"
1633
+ max_time=max_time,
1634
+ stopping_criteria=stopping_criteria,
1635
+ )
1636
+ if 'gpt2' in base_model.lower():
1637
+ gen_kwargs.update(dict(bos_token_id=tokenizer.bos_token_id, pad_token_id=tokenizer.eos_token_id))
1638
+ elif 'mbart-' in base_model.lower():
1639
+ assert tgt_lang is not None
1640
+ tgt_lang = languages_covered()[tgt_lang]
1641
+ gen_kwargs.update(dict(forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]))
1642
+ else:
1643
+ gen_kwargs.update(dict(pad_token_id=tokenizer.eos_token_id))
1644
+
1645
+ decoder = functools.partial(tokenizer.decode,
1646
+ skip_special_tokens=True,
1647
+ clean_up_tokenization_spaces=True,
1648
+ )
1649
+ decoder_raw = functools.partial(tokenizer.decode,
1650
+ skip_special_tokens=False,
1651
+ clean_up_tokenization_spaces=True,
1652
+ )
1653
+
1654
+ with torch.no_grad():
1655
+ # decoded tokenized prompt can deviate from prompt due to special characters
1656
+ inputs_decoded = decoder(input_ids[0])
1657
+ inputs_decoded_raw = decoder_raw(input_ids[0])
1658
+ if inputs_decoded == prompt:
1659
+ # normal
1660
+ pass
1661
+ elif inputs_decoded.lstrip() == prompt.lstrip():
1662
+ # sometimes extra space in front, make prompt same for prompt removal
1663
+ prompt = inputs_decoded
1664
+ elif inputs_decoded_raw == prompt:
1665
+ # some models specify special tokens that are part of normal prompt, so can't skip them
1666
+ inputs_decoded_raw = inputs_decoded
1667
+ decoder = decoder_raw
1668
+ else:
1669
+ print("WARNING: Special characters in prompt", flush=True)
1670
+ if stream_output:
1671
+ def generate(callback=None, **kwargs):
1672
+ # re-order stopping so Stream first and get out all chunks before stop for other reasons
1673
+ stopping_criteria0 = kwargs.get('stopping_criteria', StoppingCriteriaList()).copy()
1674
+ kwargs['stopping_criteria'] = StoppingCriteriaList()
1675
+ kwargs['stopping_criteria'].append(Stream(func=callback))
1676
+ for stopping_criteria1 in stopping_criteria0:
1677
+ kwargs['stopping_criteria'].append(stopping_criteria1)
1678
+
1679
+ try:
1680
+ model.generate(**kwargs)
1681
+ except torch.cuda.OutOfMemoryError as e:
1682
+ print("GPU OOM: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1683
+ flush=True)
1684
+ if kwargs['input_ids'] is not None:
1685
+ kwargs['input_ids'].cpu()
1686
+ kwargs['input_ids'] = None
1687
+ traceback.print_exc()
1688
+ clear_torch_cache()
1689
+ return
1690
+ except (Exception, RuntimeError) as e:
1691
+ if 'Expected all tensors to be on the same device' in str(e) or \
1692
+ 'expected scalar type Half but found Float' in str(e) or \
1693
+ 'probability tensor contains either' in str(e) or \
1694
+ 'cublasLt ran into an error!' in str(e):
1695
+ print(
1696
+ "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1697
+ flush=True)
1698
+ traceback.print_exc()
1699
+ clear_torch_cache()
1700
+ if raise_generate_gpu_exceptions:
1701
+ raise
1702
+ return
1703
+ else:
1704
+ raise
1705
+
1706
+ decoded_output = None
1707
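+ # CallbackToGenerator adapts the callback-style generate() into a generator: the Stream stopping criterion
+ # reports the tokens produced so far, and each partial sequence is decoded and yielded below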
+ for output in CallbackToGenerator(generate, callback=None, **gen_kwargs):
1708
+ decoded_output = decoder(output)
1709
+ if output[-1] in [tokenizer.eos_token_id]:
1710
+ if debug:
1711
+ print("HIT EOS", flush=True)
1712
+ break
1713
+ if any(ele in decoded_output for ele in hard_stop_list):
1714
+ raise StopIteration
1715
+ yield prompter.get_response(decoded_output, prompt=inputs_decoded,
1716
+ sanitize_bot_response=sanitize_bot_response)
1717
+ if save_dir and decoded_output:
1718
+ save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
1719
+ else:
1720
+ outputs = model.generate(**gen_kwargs)
1721
+ outputs = [decoder(s) for s in outputs.sequences]
1722
+ yield prompter.get_response(outputs, prompt=inputs_decoded,
1723
+ sanitize_bot_response=sanitize_bot_response)
1724
+ if save_dir and outputs and len(outputs) >= 1:
1725
+ decoded_output = prompt + outputs[0]
1726
+ save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
1727
+
1728
+
1729
+ def get_generate_params(model_lower, chat,
1730
+ stream_output, show_examples,
1731
+ prompt_type, temperature, top_p, top_k, num_beams,
1732
+ max_new_tokens, min_new_tokens, early_stopping, max_time,
1733
+ repetition_penalty, num_return_sequences,
1734
+ do_sample):
1735
+ use_defaults = False
1736
+ use_default_examples = True
1737
+ examples = []
1738
+ task_info = f"{prompt_type}"
1739
+ if model_lower:
1740
+ print(f"Using Model {model_lower}", flush=True)
1741
+ else:
1742
+ print("No model defined yet", flush=True)
1743
+
1744
+ min_new_tokens = min_new_tokens if min_new_tokens is not None else 0
1745
+ early_stopping = early_stopping if early_stopping is not None else False
1746
+ max_time_defaults = 60 * 3
1747
+ max_time = max_time if max_time is not None else max_time_defaults
1748
+
1749
+ if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
1750
+ prompt_type = inv_prompt_type_to_model_lower[model_lower]
1751
+
1752
+ # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
1753
+ if show_examples is None:
1754
+ if chat:
1755
+ show_examples = False
1756
+ else:
1757
+ show_examples = True
1758
+
1759
+ summarize_example1 = """Jeff: Can I train a ? Transformers model on Amazon SageMaker?
1760
+ Philipp: Sure you can use the new Hugging Face Deep Learning Container.
1761
+ Jeff: ok.
1762
+ Jeff: and how can I get started?
1763
+ Jeff: where can I find documentation?
1764
+ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face"""
1765
+
1766
+ if 'bart-large-cnn-samsum' in model_lower or 'flan-t5-base-samsum' in model_lower:
1767
+ placeholder_instruction = summarize_example1
1768
+ placeholder_input = ""
1769
+ use_defaults = True
1770
+ use_default_examples = False
1771
+ examples += [
1772
+ [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
1773
+ 1.0, 1,
1774
+ False]]
1775
+ task_info = "Summarization"
1776
+ elif 't5-' in model_lower or 't5' == model_lower or 'flan-' in model_lower:
1777
+ placeholder_instruction = "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
1778
+ placeholder_input = ""
1779
+ use_defaults = True
1780
+ use_default_examples = True
1781
+ task_info = "Multi-Task: Q/A, translation, Chain-of-Thought, Logical Reasoning, Summarization, etc. Best to use task prefix as trained on, e.g. `translate English to German: ` (space after colon)"
1782
+ elif 'mbart-' in model_lower:
1783
+ placeholder_instruction = "The girl has long hair."
1784
+ placeholder_input = ""
1785
+ use_defaults = True
1786
+ use_default_examples = False
1787
+ examples += [
1788
+ [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
1789
+ 1.0, 1,
1790
+ False]]
1791
+ elif 'gpt2' in model_lower:
1792
+ placeholder_instruction = "The sky is"
1793
+ placeholder_input = ""
1794
+ prompt_type = prompt_type or 'plain'
1795
+ use_default_examples = True # some will be odd "continuations" but can be ok
1796
+ examples += [
1797
+ [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
1798
+ 1.0, 1,
1799
+ False]]
1800
+ task_info = "Auto-complete phrase, code, etc."
1801
+ use_defaults = True
1802
+ else:
1803
+ if chat:
1804
+ placeholder_instruction = "Enter a question or imperative."
1805
+ else:
1806
+ placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter."
1807
+ placeholder_input = ""
1808
+ if model_lower:
1809
+ prompt_type = prompt_type or 'human_bot'
1810
+ else:
1811
+ prompt_type = ''
1812
+ examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
1813
+ stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1,
1814
+ False]]
1815
+ task_info = "No task"
1816
+ if prompt_type == 'instruct':
1817
+ task_info = "Answer question or follow imperative as instruction with optionally input."
1818
+ elif prompt_type == 'plain':
1819
+ task_info = "Auto-complete phrase, code, etc."
1820
+ elif prompt_type == 'human_bot':
1821
+ if chat:
1822
+ task_info = "Chat (Shift-Enter to give question/imperative, input concatenated with instruction)"
1823
+ else:
1824
+ task_info = "Ask question/imperative (input concatenated with instruction)"
1825
+
1826
+ # revert to plain if still nothing
1827
+ prompt_type = prompt_type or 'plain'
1828
+ if use_defaults:
1829
+ temperature = 1.0 if temperature is None else temperature
1830
+ top_p = 1.0 if top_p is None else top_p
1831
+ top_k = 40 if top_k is None else top_k
1832
+ num_beams = num_beams or 1
1833
+ max_new_tokens = max_new_tokens or 128
1834
+ repetition_penalty = repetition_penalty or 1.07
1835
+ num_return_sequences = min(num_beams, num_return_sequences or 1)
1836
+ do_sample = False if do_sample is None else do_sample
1837
+ else:
1838
+ temperature = 0.1 if temperature is None else temperature
1839
+ top_p = 0.75 if top_p is None else top_p
1840
+ top_k = 40 if top_k is None else top_k
1841
+ if chat:
1842
+ num_beams = num_beams or 1
1843
+ else:
1844
+ num_beams = num_beams or 4
1845
+ max_new_tokens = max_new_tokens or 256
1846
+ repetition_penalty = repetition_penalty or 1.07
1847
+ num_return_sequences = min(num_beams, num_return_sequences or 1)
1848
+ do_sample = False if do_sample is None else do_sample
1849
+ # doesn't include chat, instruction_nochat, iinput_nochat, added later
1850
+ params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
1851
+ early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
1852
+
1853
+ if use_default_examples:
1854
+ examples += [
1855
+ ["Translate English to French", "Good morning"] + params_list,
1856
+ ["Give detailed answer for whether Einstein or Newton is smarter.", ''] + params_list,
1857
+ ["Explain in detailed list, all the best practices for coding in python.", ''] + params_list,
1858
+ [
1859
+ "Create a markdown table with 3 rows for the primary colors, and 2 columns, with color name and hex codes.",
1860
+ ''] + params_list,
1861
+ ['Translate to German: My name is Arthur', ''] + params_list,
1862
+ ["Please answer to the following question. Who is going to be the next Ballon d'or?", ''] + params_list,
1863
+ ['Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.',
1864
+ ''] + params_list,
1865
+ ['Please answer the following question. What is the boiling point of Nitrogen?', ''] + params_list,
1866
+ ['Answer the following yes/no question. Can you write a whole Haiku in a single tweet?', ''] + params_list,
1867
+ ["Simplify the following expression: (False or False and True). Explain your answer.", ''] + params_list,
1868
+ [
1869
+ "Premise: At my age you will probably have learnt one lesson. Hypothesis: It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?",
1870
+ ''] + params_list,
1871
+ ['The square root of x is the cube root of y. What is y to the power of 2, if x = 4?', ''] + params_list,
1872
+ [
1873
+ 'Answer the following question by reasoning step by step. The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, how many apples do they have?',
1874
+ ''] + params_list,
1875
+ ["""def area_of_rectangle(a: float, b: float):
1876
+ \"\"\"Return the area of the rectangle.\"\"\"""", ''] + params_list,
1877
+ ["""# a function in native python:
1878
+ def mean(a):
1879
+ return sum(a)/len(a)
1880
+
1881
+ # the same function using numpy:
1882
+ import numpy as np
1883
+ def mean(a):""", ''] + params_list,
1884
+ ["""X = np.random.randn(100, 100)
1885
+ y = np.random.randint(0, 1, 100)
1886
+
1887
+ # fit random forest classifier with 20 estimators""", ''] + params_list,
1888
+ ]
1889
+
1890
+ src_lang = "English"
1891
+ tgt_lang = "Russian"
1892
+
1893
+ # move to correct position
1894
+ for example in examples:
1895
+ example += [chat, '', '']
1896
+ # adjust examples if non-chat mode
1897
+ if not chat:
1898
+ example[eval_func_param_names.index('instruction_nochat')] = example[
1899
+ eval_func_param_names.index('instruction')]
1900
+ example[eval_func_param_names.index('instruction')] = ''
1901
+
1902
+ example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
1903
+ example[eval_func_param_names.index('iinput')] = ''
1904
+
1905
+ return placeholder_instruction, placeholder_input, \
1906
+ stream_output, show_examples, \
1907
+ prompt_type, temperature, top_p, top_k, num_beams, \
1908
+ max_new_tokens, min_new_tokens, early_stopping, max_time, \
1909
+ repetition_penalty, num_return_sequences, \
1910
+ do_sample, \
1911
+ src_lang, tgt_lang, \
1912
+ examples, \
1913
+ task_info
1914
+
1915
+
1916
+ def languages_covered():
1917
+ # https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered
1918
+ covered = """Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX), Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX), Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX), Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV), Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO), Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR), Vietnamese (vi_VN), Chinese (zh_CN), Afrikaans (af_ZA), Azerbaijani (az_AZ), Bengali (bn_IN), Persian (fa_IR), Hebrew (he_IL), Croatian (hr_HR), Indonesian (id_ID), Georgian (ka_GE), Khmer (km_KH), Macedonian (mk_MK), Malayalam (ml_IN), Mongolian (mn_MN), Marathi (mr_IN), Polish (pl_PL), Pashto (ps_AF), Portuguese (pt_XX), Swedish (sv_SE), Swahili (sw_KE), Tamil (ta_IN), Telugu (te_IN), Thai (th_TH), Tagalog (tl_XX), Ukrainian (uk_UA), Urdu (ur_PK), Xhosa (xh_ZA), Galician (gl_ES), Slovene (sl_SI)"""
1919
+ covered = covered.split(', ')
1920
+ covered = {x.split(' ')[0]: x.split(' ')[1].replace(')', '').replace('(', '') for x in covered}
1921
+ return covered
1922
+
1923
+
1924
+ def test_test_prompt(prompt_type='instruct', data_point=0):
1925
+ example_data_point = example_data_points[data_point]
1926
+ example_data_point.pop('output', None)
1927
+ return generate_prompt(example_data_point, prompt_type, False, False)
1928
+
1929
+
1930
+ if __name__ == "__main__":
1931
+ print("""
1932
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
1933
+ python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
1934
+ python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
1935
+
1936
+ # generate without lora weights, no prompt
1937
+ python generate.py --base_model='EleutherAI/gpt-neox-20b' --prompt_type='plain'
1938
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq'
1939
+
1940
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq' --lora_weights='lora_20B_daifaq'
1941
+ # OpenChatKit settings:
1942
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0
1943
+
1944
+ python generate.py --base_model='distilgpt2' --prompt_type='plain' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0 --share=False
1945
+ python generate.py --base_model='t5-large' --prompt_type='simple_instruct'
1946
+ python generate.py --base_model='philschmid/bart-large-cnn-samsum'
1947
+ python generate.py --base_model='philschmid/flan-t5-base-samsum'
1948
+ python generate.py --base_model='facebook/mbart-large-50-many-to-many-mmt'
1949
+
1950
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --lora_weights='GPT-NeoXT-Chat-Base-20B.merged.json.8_epochs.57b2892c53df5b8cefac45f84d019cace803ef26.28'
1951
+
1952
+ must have 4x48GB GPUs and run without 8-bit in order for sharding to work with infer_devices=False
1953
+ can also pass --prompt_type='human_bot'; the model can somewhat handle instructions without being instruct-tuned
1954
+ python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
1955
+
1956
+ python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b
1957
+
1958
+ """, flush=True)
1959
+ fire.Fire(main)
client_test.py ADDED
@@ -0,0 +1,93 @@
1
+ """
2
+ Client test.
3
+
4
+ Run server:
5
+
6
+ python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b
7
+
8
+ NOTE: For private models, add --use_auth_token=True
9
+
10
+ NOTE: --infer_devices=True (default) must be used for multi-GPU; use it if you see failures with cuda:x vs. cuda:y device mismatches.
11
+ Currently, this forces the model onto a single GPU.
12
+
13
+ Then run this client as:
14
+
15
+ python client_test.py
16
+ """
17
+
18
+ debug = False
19
+
20
+ import os
21
+ os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
22
+ from gradio_client import Client
23
+
24
+ client = Client("http://localhost:7860")
25
+ if debug:
26
+ print(client.view_api(all_endpoints=True))
27
+
28
+ instruction = '' # only for chat=True
29
+ iinput = '' # only for chat=True
30
+ context = ''
31
+ # streaming output is supported, loops over and outputs each generation in streaming mode
32
+ # but leave stream_output=False for simple input/output mode
33
+ stream_output = False
34
+ prompt_type = 'human_bot'
35
+ temperature = 0.1
36
+ top_p = 0.75
37
+ top_k = 40
38
+ num_beams = 1
39
+ max_new_tokens = 50
40
+ min_new_tokens = 0
41
+ early_stopping = False
42
+ max_time = 20
43
+ repetition_penalty = 1.0
44
+ num_return_sequences = 1
45
+ do_sample = True
46
+ # only these 2 below used if pass chat=False
47
+ chat = False
48
+ instruction_nochat = "Who are you?"
49
+ iinput_nochat = ''
50
+
51
+
52
+ def test_client_basic():
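+ # args must follow the same order as evaluate()'s parameters on the server (see eval_func_param_names in app.py)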
53
+ args = [instruction,
54
+ iinput,
55
+ context,
56
+ stream_output,
57
+ prompt_type,
58
+ temperature,
59
+ top_p,
60
+ top_k,
61
+ num_beams,
62
+ max_new_tokens,
63
+ min_new_tokens,
64
+ early_stopping,
65
+ max_time,
66
+ repetition_penalty,
67
+ num_return_sequences,
68
+ do_sample,
69
+ chat,
70
+ instruction_nochat,
71
+ iinput_nochat,
72
+ ]
73
+ api_name = '/submit_nochat'
74
+ res = client.predict(
75
+ *tuple(args),
76
+ api_name=api_name,
77
+ )
78
+ res_dict = dict(instruction_nochat=instruction_nochat, iinput_nochat=iinput_nochat, response=md_to_text(res))
79
+ print(res_dict)
80
+
81
+
82
+ import markdown # pip install markdown
83
+ from bs4 import BeautifulSoup # pip install beautifulsoup4
84
+
85
+
86
+ def md_to_text(md):
87
+ html = markdown.markdown(md)
88
+ soup = BeautifulSoup(html, features='html.parser')
89
+ return soup.get_text()
90
+
91
+
92
+ if __name__ == '__main__':
93
+ test_client_basic()
finetune.py ADDED
@@ -0,0 +1,934 @@
1
+ import os
2
+ import pathlib
3
+ import random
4
+ import shutil
5
+ import subprocess
6
+ import sys
7
+ import time
8
+ from datetime import datetime
9
+ from typing import List, Union
10
+ import fire
11
+ import numpy as np
12
+ import torch
13
+ from datasets import load_dataset, concatenate_datasets
14
+ import transformers
15
+ import torch.distributed as dist
16
+
17
+ from peft import (
18
+ prepare_model_for_int8_training,
19
+ LoraConfig,
20
+ get_peft_model,
21
+ get_peft_model_state_dict,
22
+ set_peft_model_state_dict,
23
+ )
24
+
25
+ from peft import mapping
26
+ lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
27
+
28
+
29
+ def log(*args, **kwargs):
30
+ if int(os.environ.get("LOCAL_RANK", 0)) == 0:
31
+ print(*args, **kwargs)
32
+
33
+
34
+ try:
35
+ import neptune
36
+ from transformers.integrations import NeptuneCallback
37
+
38
+ neptune_run = neptune.init_run(
39
+ source_files=[],
40
+ )
41
+ log("Connected to Neptune.")
42
+ except ImportError:
43
+ neptune_run = None
44
+ log("Please pip install neptune for tracking.")
45
+ except neptune.exceptions.NeptuneMissingApiTokenException:
46
+ neptune_run = None
47
+ os.environ["NEPTUNE_MODE"] = 'debug'
48
+ log("No neptune configured, set NEPTUNE_API_TOKEN env var.")
49
+
50
+ from enum import Enum
51
+
52
+
53
+ class PromptType(Enum):
54
+ plain = 0
55
+ instruct = 1
56
+ quality = 2
57
+ human_bot = 3
58
+ dai_faq = 4
59
+ summarize = 5
60
+ simple_instruct = 6
61
+ instruct_vicuna = 7
62
+ instruct_with_end = 8
63
+ human_bot_orig = 9
64
+
65
+
66
+ prompt_type_to_model_name = {
67
+ 'plain': [
68
+ 'EleutherAI/gpt-j-6B',
69
+ 'EleutherAI/pythia-6.9b',
70
+ 'EleutherAI/pythia-12b',
71
+ 'EleutherAI/pythia-12b-deduped',
72
+ 'EleutherAI/gpt-neox-20b',
73
+ 'decapoda-research/llama-7b-hf',
74
+ 'decapoda-research/llama-13b-hf',
75
+ 'decapoda-research/llama-30b-hf',
76
+ 'decapoda-research/llama-65b-hf',
77
+ 'facebook/mbart-large-50-many-to-many-mmt',
78
+ 'philschmid/bart-large-cnn-samsum',
79
+ 'philschmid/flan-t5-base-samsum',
80
+ 'gpt2',
81
+ 'distilgpt2',
82
+ ],
83
+ 'instruct': [],
84
+ 'instruct_with_end': ['databricks/dolly-v2-12b'],
85
+ 'quality': [],
86
+ 'human_bot': [
87
+ 'h2oai/h2ogpt-oig-oasst1-256-12b',
88
+ 'h2oai/h2ogpt-oasst1-512-12b',
89
+ 'h2oai/h2ogpt-oasst1-256-20b',
90
+ 'h2oai/h2ogpt-oasst1-512-20b',
91
+ 'h2oai/h2ogpt-oig-oasst1-256-6.9b',
92
+ ],
93
+ 'dai_faq': [],
94
+ 'summarize': [],
95
+ 'simple_instruct': ['t5-small', 't5-large', 'google/flan-t5', 'google/flan-t5-xxl', 'google/flan-ul2'],
96
+ 'instruct_vicuna': ['AlekseyKorshuk/vicuna-7b'],
97
+ 'human_bot_orig': ['togethercomputer/GPT-NeoXT-Chat-Base-20B'],
98
+ }
99
+
100
+ inv_prompt_type_to_model_name = {v.strip(): k for k, l in prompt_type_to_model_name.items() for v in l}
101
+ inv_prompt_type_to_model_lower = {v.strip().lower(): k for k, l in prompt_type_to_model_name.items() for v in l}
102
+
103
+ human = '<human>:'
104
+ bot = "<bot>:"
105
+
106
+ prompt_types_strings = []
107
+ for p in PromptType:
108
+ prompt_types_strings.extend([p.name])
109
+
110
+
111
+ prompt_types = []
112
+ for p in PromptType:
113
+ prompt_types.extend([p.name, p.value, str(p.value)])
114
+
115
+
116
+ # supported by huggingface evaluate
117
+ supported_metrics = ['bleu', 'rouge', 'sacrebleu', 'meteor']
118
+
119
+
120
+ def train(
121
+ save_code: bool = False,
122
+ run_id: int = None,
123
+
124
+ base_model: str = 'h2oai/h2ogpt-oig-oasst1-512-6.9b',
125
+ # base_model: str = 'h2oai/h2ogpt-oasst1-512-12b',
126
+ # base_model: str = 'h2oai/h2ogpt-oasst1-512-20b',
127
+ # base_model: str = 'EleutherAI/gpt-neox-20b',
128
+ # base_model: str = 'EleutherAI/pythia-12b-deduped',
129
+ # base_model: str = 'togethercomputer/GPT-NeoXT-Chat-Base-20B',
130
+ # base_model: str = 'decapoda-research/llama-7b-hf',
131
+ # base_model: str = 'decapoda-research/llama-13b-hf',
132
+ # base_model: str = 'decapoda-research/llama-30b-hf',
133
+ # base_model: str = 'EleutherAI/gpt-j-6B',
134
+
135
+ # only needed if base_model is self-exported HF state without tokenizer
136
+ tokenizer_base_model: str = None,
137
+ # tokenizer_base_model: str = 'EleutherAI/gpt-neox-20b',
138
+
139
+ data_path: str = None,
140
+ data_col_dict: dict = None,
141
+ # data_path: str = "./dai_docs.train.json",
142
+ prompt_type: Union[str, int] = "plain", # "plain", "instruct", "quality", "human_bot", "dai_faq"
143
+
144
+ valid_path: str = None,
145
+ # valid_path: str = "./dai_docs.valid.json",
146
+
147
+ # data_mix_in_path: str = "laion/OIG", # way too big, medium quality
148
+ data_mix_in_path: str = "0-hero/OIG-small-chip2", # high quality, 50 MB, good enough for now
149
+ data_mix_in_factor: float = 0.0, # >1: more mix-in data, <1: more of data_path data
150
+ data_mix_in_col_dict: dict = {'user': 'instruction', 'chip2': 'output'},
151
+ data_mix_in_prompt_type: str = "instruct", # just instruction->output, same as instruct
152
+
153
+ output_dir: str = None,
154
+
155
+ # LoRA checkpoint continuation
156
+ lora_weights: str = "",
157
+
158
+ # batching training hyperparams
159
+ batch_size: int = 128,
160
+ micro_batch_size: int = 4,
161
+ gradient_checkpointing=False, # unnecessary with gradient accumulation enabled
162
+ fp16=True,
163
+
164
+ # general training hyperparams
165
+ num_epochs: float = 1,
166
+ learning_rate: float = 3e-4,
167
+
168
+ # validation settings
169
+ val_set_size: int = None,
170
+ val_metrics: List[str] = [],
171
+ eval_steps: int = None, # to control eval steps via steps
172
+ eval_epochs: float = None, # to control eval steps via epochs
173
+
174
+ # lora hyperparams
175
+ lora_r: int = 8,
176
+ lora_alpha: int = 16,
177
+ lora_dropout: float = 0.05,
178
+ lora_target_modules: List[str] = None,
179
+ llama_type: bool = None,
180
+
181
+ # llm hyperparams
182
+ train_on_inputs: bool = True, # if False, masks out inputs in loss
183
+ group_by_length: bool = False, # if True, faster, but produces an odd training loss curve
184
+ resume_from_checkpoint: str = None, # either training checkpoint or final adapter
185
+ cutoff_len: int = 1024, # Good default, especially when have high quality non-trivial data
186
+
187
+ # torch training params
188
+ ddp: bool = True, # set to False if OOM with True, for multi-GPU model parallelism
189
+ local_files_only: bool = False, # else will download new versions, normally unwanted
190
+ resume_download: bool = True,
191
+ use_auth_token: Union[str, bool] = False, # True requires CLI did huggingface-cli login before running
192
+ warmup_steps: int = 100,
193
+ logging_steps: int = 1,
194
+ save_steps: int = None, # must be round multiple of eval_steps
195
+ add_eos_token: bool = False,
196
+ ):
197
+ # allow set token directly
198
+ use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
199
+
200
+ prompt_type = str(prompt_type) # migration from integers
201
+ assert prompt_type in prompt_types
202
+
203
+ world_size = int(os.getenv("WORLD_SIZE", 1))
204
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
205
+ rank = int(os.getenv("RANK", 0))
206
+ print(f"local_rank: {local_rank}")
207
+ print(f"global rank: {rank}")
208
+
209
+ gpus = max(world_size, torch.cuda.device_count())
210
+ run_id = run_id or 0
211
+ if not data_path:
212
+ raise ValueError("No data_path provided")
213
+ if not output_dir:
214
+ output_dir = f"{base_model.split('/')[-1]}.{data_path.replace('/', '')}.{num_epochs}_epochs.{get_githash() or 'nogit'}.{run_id}"
215
+ if os.path.exists(output_dir) and not resume_from_checkpoint:
216
+ raise FileExistsError(f"output_dir based on run_id {run_id} already exists. Please pick a different run_id.")
217
+ else:
218
+ if os.path.exists(output_dir) and not resume_from_checkpoint:
219
+ raise FileExistsError(f"output_dir {output_dir} already exists. Please pick a different output_dir, or specify a run_id instead.")
220
+ device_map = "auto"
221
+
222
+ if save_code:
223
+ copy_code(run_id)
224
+ if tokenizer_base_model is None:
225
+ tokenizer_base_model = base_model
226
+ if llama_type is None:
227
+ llama_type = "llama" in base_model.lower()
228
+ assert (
229
+ base_model
230
+ ), "Please specify a --base_model, e.g. --base_model='decapoda-research/llama-7b-hf'"
231
+ gradient_accumulation_steps = batch_size // micro_batch_size
232
+ assert gradient_accumulation_steps >= world_size, "must increase batch_size for multi-GPU"
233
+
234
+ device_map = "auto"
235
+
236
+ locals_dict = locals()
237
+ locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
238
+ log(f"Training model with params:\n{locals_print}")
239
+ log("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), get_githash()))
240
+
241
+ max_memory = None
242
+ if gpus > 1:
243
+ if ddp:
244
+ log("Distributed: data parallel")
245
+ device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
246
+ gradient_accumulation_steps = gradient_accumulation_steps // world_size
247
+ else:
248
+ free_in_GB = int(min(torch.cuda.mem_get_info()) / 1024 ** 3)
249
+ max_memory = f"{free_in_GB - 2}GB"
250
+ max_memory = {i: max_memory for i in range(gpus)}
251
+ log("world_size: %d" % world_size)
252
+ log("num_gpus: %d" % gpus)
253
+ log("max mem: %s" % max_memory)
254
+
255
+ model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
256
+
257
+ model = model_loader.from_pretrained(
258
+ base_model,
259
+ load_in_8bit=True,
260
+ device_map=device_map,
261
+ torch_dtype=torch.float16,
262
+ max_memory=max_memory,
263
+ local_files_only=local_files_only,
264
+ resume_download=resume_download,
265
+ use_auth_token=use_auth_token,
266
+ )
267
+ if gpus > 1:
268
+ if not ddp:
269
+ log("model parallel")
270
+ model.is_parallelizable = True
271
+ model.model_parallel = True
272
+
273
+ tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
274
+ local_files_only=local_files_only,
275
+ resume_download=resume_download,
276
+ use_auth_token=use_auth_token)
277
+
278
+ tokenizer.pad_token_id = 0 # different from the eos token
279
+ # when generating, we use the logits of the right-most token to predict the next token,
280
+ # so the padding should be on the left,
281
+ # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
282
+ tokenizer.padding_side = "left" # Allow batched inference
283
+
284
+ def tokenize(prompt, add_eos_token=True):
285
+ # there's probably a way to do this with the tokenizer settings
286
+ # but again, gotta move fast
287
+ result = tokenizer(
288
+ prompt,
289
+ truncation=True,
290
+ max_length=cutoff_len,
291
+ padding=False,
292
+ return_tensors=None,
293
+ )
294
+ if (
295
+ result["input_ids"][-1] != tokenizer.eos_token_id
296
+ and len(result["input_ids"]) < cutoff_len
297
+ and add_eos_token
298
+ ):
299
+ result["input_ids"].append(tokenizer.eos_token_id)
300
+ result["attention_mask"].append(1)
301
+
302
+ result["labels"] = result["input_ids"].copy()
303
+
304
+ return result
305
+
306
+ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
307
+ full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
308
+ tokenized_full_prompt = tokenize(full_prompt)
309
+ if not train_on_inputs:
310
+ user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
311
+ tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos)
312
+ user_prompt_len = len(tokenized_user_prompt["input_ids"])
313
+ if add_eos:
314
+ user_prompt_len -= 1
315
+
316
+ # ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
317
+ tokenized_full_prompt["labels"] = [
318
+ -100
319
+ ] * user_prompt_len + tokenized_full_prompt["labels"][
320
+ user_prompt_len:
321
+ ] # could be sped up, probably
322
+ return tokenized_full_prompt
323
+
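A minimal sketch (not from the commit; token ids invented) of what the -100 label masking above accomplishes when train_on_inputs=False: prompt tokens are excluded from the loss, so only response tokens contribute.

    # hypothetical token ids for the prompt and the response
    user_ids = [101, 102, 103]            # assumed prompt tokens
    full_ids = user_ids + [201, 202, 2]   # assumed prompt + response + eos
    labels = [-100] * len(user_ids) + full_ids[len(user_ids):]
    assert labels == [-100, -100, -100, 201, 202, 2]  # CrossEntropyLoss ignores the -100 positions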
324
+ if "gpt-neox" not in base_model or True:
325
+ model = prepare_model_for_int8_training(model)
326
+ else:
327
+ model = prepare_model_for_int8_training(
328
+ model,
329
+ output_embedding_layer_name="embed_out", # keep output logits in float32
330
+ layer_norm_names=["layer_norm", "layernorm"], # keep all layer norms in higher precision
331
+ )
332
+ if lora_weights:
333
+ from peft import PeftModel
334
+ model = PeftModel.from_pretrained(
335
+ model,
336
+ lora_weights,
337
+ torch_dtype=torch.float16,
338
+ device_map=device_map,
339
+ local_files_only=local_files_only,
340
+ resume_download=resume_download,
341
+ use_auth_token=use_auth_token,
342
+ )
343
+ else:
344
+ if lora_target_modules is None:
345
+ base_model_lower = base_model.lower()
346
+ if base_model_lower in lora_mappings:
347
+ lora_target_modules_cand = [lora_mappings[base_model_lower]]
348
+ else:
349
+ lora_target_modules_cand = [["query_key_value"], ["q_proj", "v_proj"]]
350
+ else:
351
+ lora_target_modules_cand = [lora_target_modules]
352
+
353
+ for lora_target_modules in lora_target_modules_cand:
354
+ try:
355
+ config = LoraConfig(
356
+ r=lora_r,
357
+ lora_alpha=lora_alpha,
358
+ target_modules=lora_target_modules,
359
+ lora_dropout=lora_dropout,
360
+ bias="none",
361
+ task_type="CAUSAL_LM",
362
+ )
363
+ model = get_peft_model(model, config)
364
+ break
365
+ except ValueError as e:
366
+ if "Target modules" in str(e) and "not found" in str(e):
367
+ continue
368
+ else:
369
+ raise
370
+ from peft import PeftModel
371
+ assert isinstance(model, PeftModel), "LoRA failed. Please provide --lora_target_modules explicitly."
372
+ if resume_from_checkpoint:
373
+ # Check the available weights and load them
374
+ checkpoint_name = os.path.join(
375
+ resume_from_checkpoint, "pytorch_model.bin"
376
+ ) # Full checkpoint
377
+ if not os.path.exists(checkpoint_name):
378
+ checkpoint_name = os.path.join(
379
+ resume_from_checkpoint, "adapter_model.bin"
380
+ ) # only LoRA model - LoRA config above has to fit
381
+ resume_from_checkpoint = False # So the trainer won't try loading its state
382
+ # The two files above have a different name depending on how they were saved, but are actually the same.
383
+ if os.path.exists(checkpoint_name):
384
+ log(f"Restarting from {checkpoint_name}")
385
+ adapters_weights = torch.load(checkpoint_name)
386
+ model = set_peft_model_state_dict(model, adapters_weights)
387
+ else:
388
+ log(f"Checkpoint {checkpoint_name} not found")
389
+
390
+ print(model)
391
+ model.print_trainable_parameters() # Be more transparent about the % of trainable params.
392
+
393
+ metrics = {}
394
+ for name in supported_metrics:
395
+ if name in val_metrics:
396
+ import evaluate # Causes hang for 'python generate.py' on dual 4090 if imported early, 100% reproducible
397
+ metrics[name] = evaluate.load(name)
398
+ log("Using Validation Metrics: %s" % str(list(metrics.keys())))
399
+ log("Supported Metrics: %s" % supported_metrics)
400
+
401
+ if val_set_size is None:
402
+ if len(metrics) == 0:
403
+ val_set_size = 1000
404
+ else:
405
+ val_set_size = 100
406
+ log("Auto set val_set_size %s" % val_set_size)
407
+ elif val_set_size < 1.0 and val_set_size != 0:
408
+ raise RuntimeError("Fractional validation size not supported.")
409
+
410
+ if valid_path:
411
+ data = load_dataset("json", data_files={"train": data_path, "valid": valid_path})
412
+ else:
413
+ if "json" in data_path:
414
+ data = load_dataset("json", data_files={"train": data_path})
415
+ else:
416
+ data = load_dataset(data_path)
417
+ data = data.rename_columns(data_col_dict or {})
418
+
419
+ valid_data = None
420
+ train_data_mix_in = None
421
+ valid_data_mix_in = None
422
+
423
+ if data_mix_in_path and data_mix_in_factor > 0:
424
+ # get mix-in training/validation data - to keep model "sane"
425
+ num_rows = data["train"].num_rows
426
+ log("Loading mix-in dataset: %s" % data_mix_in_path)
427
+ if "json" in data_mix_in_path:
428
+ data_mix_in = load_dataset("json", data_files={"train": data_mix_in_path})["train"]
429
+ else:
430
+ data_mix_in = load_dataset(data_mix_in_path)["train"] # can be large
431
+ data_mix_in = data_mix_in.rename_columns(data_mix_in_col_dict or {})
432
+
433
+ # only get as much as we need to balance
434
+ valid_size = min(data_mix_in.num_rows // 2, val_set_size or 0)
435
+ train_size = max(1, min(data_mix_in.num_rows - valid_size, int(num_rows * data_mix_in_factor)))
436
+ mixin_small = data_mix_in.train_test_split(
437
+ test_size=train_size + valid_size,
438
+ shuffle=True, seed=np.random.randint(10000),
439
+ )["test"]
440
+ if valid_size:
441
+ mixin_train_test = mixin_small.train_test_split(
442
+ test_size=valid_size, shuffle=False,
443
+ )
444
+ train_data_mix_in = mixin_train_test["train"]
445
+ valid_data_mix_in = mixin_train_test["test"]
446
+ else:
447
+ train_data_mix_in = mixin_small
448
+
449
+ if "prompt_type" not in train_data_mix_in.column_names:
450
+ train_data_mix_in = train_data_mix_in.add_column(
451
+ "prompt_type",
452
+ [data_mix_in_prompt_type] * train_data_mix_in.num_rows,
453
+ )
454
+ log("Added prompt type %s to mix-in training data" % data_mix_in_prompt_type)
455
+ if valid_data_mix_in and "prompt_type" not in valid_data_mix_in.column_names:
456
+ valid_data_mix_in = valid_data_mix_in.add_column(
457
+ "prompt_type",
458
+ [data_mix_in_prompt_type] * valid_data_mix_in.num_rows,
459
+ )
460
+ log("Added prompt type %s to mix-in validation data" % data_mix_in_prompt_type)
461
+ log("Created mix-in data:\nTrain %s\nValid %s" % (train_data_mix_in, valid_data_mix_in))
462
+
463
+ # get our own training/validation data - for fine-tuning
464
+ if val_set_size > 0 and not valid_path and not data_mix_in_path:
465
+ # create valid split from train
466
+ train_val = data["train"].train_test_split(
467
+ test_size=val_set_size, shuffle=True, seed=42
468
+ )
469
+ train_data = train_val["train"]
470
+ valid_data = train_val["test"]
471
+ else:
472
+ train_data = data["train"]
473
+ if valid_path:
474
+ # use given valid split, has priority over data_mix_in_path
475
+ valid_data = data["valid"]
476
+ if "prompt_type" not in train_data.column_names:
477
+ train_data = train_data.add_column(
478
+ "prompt_type",
479
+ [prompt_type] * train_data.num_rows,
480
+ )
481
+ log("Added prompt type %s to training data" % prompt_type)
482
+ if valid_data and "prompt_type" not in valid_data.column_names:
483
+ valid_data = valid_data.add_column(
484
+ "prompt_type",
485
+ [prompt_type] * valid_data.num_rows,
486
+ )
487
+ log("Added prompt type %s to validation data" % prompt_type)
488
+
489
+ assert train_data is not None
490
+
491
+ # shuffle and tokenize data
492
+ if train_data_mix_in:
493
+ train_data = concatenate_datasets([train_data, train_data_mix_in])
494
+ train_data = train_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
495
+ train_set_size = len(train_data)
496
+
497
+ if valid_data and valid_data_mix_in:
498
+ valid_data = concatenate_datasets([valid_data, valid_data_mix_in])
499
+ elif valid_data_mix_in:
500
+ valid_data = valid_data_mix_in
501
+
502
+ if valid_data:
503
+ valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
504
+ val_set_size = len(valid_data)
505
+ else:
506
+ val_set_size = 0
507
+ log("Final fine-tuning data:\nTrain %s\nValid %s" % (train_data, valid_data))
508
+ sample_row_dict = train_data[:1]
509
+ del sample_row_dict['input_ids']
510
+ del sample_row_dict['attention_mask']
511
+ del sample_row_dict['labels']
512
+ log("Sample input: %s" % sample_row_dict)
513
+
514
+ if neptune_run:
515
+ neptune_callback = NeptuneCallback(run=neptune_run)
516
+ callbacks = [neptune_callback]
517
+ else:
518
+ from transformers.integrations import TensorBoardCallback, is_tensorboard_available
519
+ if is_tensorboard_available():
520
+ # tensorboard --logdir=runs/
521
+ from torch.utils.tensorboard import SummaryWriter
522
+ tb_writer = SummaryWriter()
523
+ callbacks = [TensorBoardCallback(tb_writer=tb_writer)]
524
+ else:
525
+ callbacks = []
526
+
527
+ expected_steps = (train_set_size * num_epochs) // batch_size
528
+ if eval_steps is None and eval_epochs is None:
529
+ # 20 evaluations for a run
530
+ eval_steps = max(1, int(expected_steps / 20))
531
+ log("Auto set eval_steps to %s out of %s total training steps" % (eval_steps, expected_steps))
532
+ elif eval_steps is None and eval_epochs is not None:
533
+ eval_steps = max(1, int(expected_steps * eval_epochs / num_epochs))
534
+ log("Auto converted eval_epochs=%s to eval_steps %s"
535
+ " out of %s total training steps" % (eval_epochs, eval_steps, expected_steps))
536
+ if save_steps is None:
537
+ save_steps = eval_steps
538
+ log("Auto step save_steps to %s" % save_steps)
539
+ elif save_steps > eval_steps:
540
+ # save steps must be round multiple of eval_steps
541
+ save_steps0 = save_steps
542
+ save_steps = max(1, (save_steps//eval_steps)) * eval_steps
543
+ if save_steps0 != save_steps:
544
+ log("Auto converted save_steps from %s to %s" % (save_steps0, save_steps))
545
+
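As a hedged worked example of the auto-scheduling above (all numbers invented for illustration): 20,000 training rows, one epoch, and batch_size=128 give 156 expected steps, so eval_steps defaults to 7 and a requested save_steps of 20 is rounded down to a multiple of eval_steps.

    # illustrative arithmetic only, mirroring the logic above
    train_set_size, num_epochs, batch_size = 20000, 1, 128          # assumed values
    expected_steps = (train_set_size * num_epochs) // batch_size    # 156
    eval_steps = max(1, int(expected_steps / 20))                   # 7
    save_steps = max(1, 20 // eval_steps) * eval_steps              # 14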
546
+ def compute_metrics(eval_preds):
547
+ # e.g. see: https://huggingface.co/docs/transformers/v4.25.1/en/tasks/translation#evaluate
548
+ inputs = eval_preds.inputs
549
+ label_ids = eval_preds.label_ids
550
+ predictions = eval_preds.predictions
551
+
552
+ #inputs = np.where(inputs != -100, inputs, tokenizer.pad_token_id)
553
+ #decoded_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True)
554
+ #decoded_inputs = [pred.strip() for pred in decoded_inputs]
555
+
556
+ label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
557
+ # tokenizer behavior like generate time
558
+ decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True,
559
+ clean_up_tokenization_spaces=True)
560
+ decoded_labels = [pred.strip() for pred in decoded_labels]
561
+
562
+ predictions = np.argmax(predictions, -1)
563
+ predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
564
+ # tokenizer behavior like generate time
565
+ decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True,
566
+ clean_up_tokenization_spaces=True)
567
+ decoded_predictions = [pred.strip() for pred in decoded_predictions]
568
+
569
+ result = {}
570
+ for metric in metrics.values():
571
+ result1 = metric.compute(predictions=decoded_predictions, references=decoded_labels)
572
+ # get rid of lists, for precision etc., for now
573
+ numeric_results = {k: v for k, v in result1.items() if isinstance(v, (int, float))}
574
+ result.update(numeric_results)
575
+ return result
576
+
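A short hedged sketch of how the evaluate metrics loaded earlier behave outside the Trainer; the strings are made-up examples and only numeric results are kept, as in compute_metrics above.

    import evaluate
    rouge = evaluate.load("rouge")  # requires the rouge_score package from requirements.txt
    scores = rouge.compute(predictions=["ducks swim in the lake"],
                           references=["ducks eat and swim at the lake"])
    print({k: v for k, v in scores.items() if isinstance(v, (int, float))})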
577
+ # the callback that computes metrics of interest
578
+ if val_metrics:
579
+ trainer_kwargs = dict(compute_metrics=compute_metrics)
580
+ else:
581
+ trainer_kwargs = dict()
582
+
583
+ trainer = transformers.Trainer(
584
+ model=model,
585
+ tokenizer=tokenizer,
586
+ train_dataset=train_data,
587
+ eval_dataset=valid_data,
588
+ # NOTE: CausalLM does not use the Seq2Seq-specific arguments, but Seq2SeqTrainingArguments is not incompatible
589
+ args=transformers.Seq2SeqTrainingArguments(
590
+ per_device_train_batch_size=micro_batch_size,
591
+ per_device_eval_batch_size=1,
592
+ eval_accumulation_steps=10,
593
+ # predict_with_generate=True, # SEQ2SEQ only
594
+ include_inputs_for_metrics=True,
595
+ gradient_accumulation_steps=gradient_accumulation_steps,
596
+ warmup_steps=warmup_steps,
597
+ num_train_epochs=num_epochs,
598
+ learning_rate=learning_rate,
599
+ gradient_checkpointing=gradient_checkpointing,
600
+ fp16=fp16,
601
+ # consider 8-bit adam: https://huggingface.co/docs/transformers/v4.18.0/en/performance#8bit-adam
602
+ optim="adamw_torch", # consider "adafactor" to save memory
603
+ logging_steps=logging_steps,
604
+ logging_strategy="steps",
605
+ evaluation_strategy="steps" if val_set_size > 0 else "no",
606
+ save_strategy="steps",
607
+ eval_steps=eval_steps if val_set_size > 0 else None,
608
+ save_steps=save_steps,
609
+ output_dir=output_dir,
610
+ save_total_limit=3,
611
+ load_best_model_at_end=True if val_set_size > 0 else False,
612
+ ddp_find_unused_parameters=False if ddp else None,
613
+ group_by_length=group_by_length,
614
+ #fsdp="shard_grad_op auto_wrap" if gpus > 1 and not ddp else None,
615
+ #fsdp_min_num_params=20000 if gpus > 1 and not ddp else None,
616
+ report_to='tensorboard' if not neptune_run else 'neptune',
617
+ ),
618
+ data_collator=transformers.DataCollatorForSeq2Seq(
619
+ tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
620
+ ),
621
+ callbacks=callbacks,
622
+ **trainer_kwargs,
623
+ )
624
+ model.config.use_cache = False
625
+
626
+ old_state_dict = model.state_dict
627
+ model.state_dict = (
628
+ lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
629
+ ).__get__(model, type(model))
630
+
631
+ if torch.__version__ >= "2" and sys.platform != "win32":
632
+ model = torch.compile(model)
633
+ # WIP (not generally replacing layers until pytorch 2.1)
634
+ torch.backends.cuda.enable_flash_sdp(True)
635
+
636
+ if gpus > 1 and not ddp:
637
+ assert trainer.is_model_parallel
638
+ else:
639
+ assert not trainer.is_model_parallel
640
+ trainer.train(resume_from_checkpoint=resume_from_checkpoint)
641
+
642
+ model.save_pretrained(output_dir)
643
+
644
+ log("\n If there's a warning about missing keys above, please disregard :)")
645
+
646
+
647
+ def get_loaders(llama_type, model_name, reward_type):
648
+ # NOTE: Some models need specific new prompt_type
649
+ # E.g. t5_xxl_true_nli_mixture has input format: "premise: PREMISE_TEXT hypothesis: HYPOTHESIS_TEXT".
650
+ if llama_type:
651
+ from transformers import LlamaForCausalLM, LlamaTokenizer
652
+ model_loader = LlamaForCausalLM
653
+ tokenizer_loader = LlamaTokenizer
654
+ elif 'gpt2' in model_name.lower():
655
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
656
+ return GPT2LMHeadModel, GPT2Tokenizer
657
+ elif 'mbart-' in model_name.lower():
658
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
659
+ return MBartForConditionalGeneration, MBart50TokenizerFast
660
+ elif 't5' == model_name.lower() or \
661
+ 't5-' in model_name.lower() or \
662
+ 'flan-' in model_name.lower():
663
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
664
+ return T5ForConditionalGeneration, AutoTokenizer
665
+ elif 'bigbird' in model_name:
666
+ from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
667
+ return BigBirdPegasusForConditionalGeneration, AutoTokenizer
668
+ elif 'bart-large-cnn-samsum' in model_name or 'flan-t5-base-samsum' in model_name:
669
+ from transformers import pipeline
670
+ return pipeline, "summarization"
671
+ elif reward_type or 'OpenAssistant/reward-model'.lower() in model_name.lower():
672
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
673
+ return AutoModelForSequenceClassification, AutoTokenizer
674
+ else:
675
+ from transformers import AutoTokenizer, AutoModelForCausalLM
676
+ model_loader = AutoModelForCausalLM
677
+ tokenizer_loader = AutoTokenizer
678
+ return model_loader, tokenizer_loader
679
+
680
+
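For reference, a hedged usage sketch of get_loaders (the model name is only an example); note that a few branches, e.g. the samsum summarization models, return a pipeline factory and a task string instead of model/tokenizer classes.

    model_loader, tokenizer_loader = get_loaders(llama_type=False,
                                                 model_name="EleutherAI/gpt-j-6B",
                                                 reward_type=False)
    # here these are AutoModelForCausalLM and AutoTokenizer, so .from_pretrained()
    # can be called on both, as train() does above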
681
+ def get_githash():
682
+ try:
683
+ githash = subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout.decode('utf-8')[0:-1]
684
+ except Exception:
685
+ githash = ''
686
+ return githash
687
+
688
+
689
+ def copy_code(run_id):
690
+ """
691
+ copy code to track changes
692
+ :param run_id:
693
+ :return:
694
+ """
695
+ rnd_num = str(random.randint(0, 2 ** 31))
696
+ run_id = 'run_' + str(run_id)
697
+ os.makedirs(run_id, exist_ok=True)
698
+ me_full = os.path.join(pathlib.Path(__file__).parent.resolve(), __file__)
699
+ me_file = os.path.basename(__file__)
700
+ new_me = os.path.join(run_id, me_file + '_' + get_githash())
701
+ if os.path.isfile(new_me):
702
+ new_me = os.path.join(run_id, me_file + '_' + get_githash() + '_' + rnd_num)
703
+ shutil.copy(me_full, new_me)
704
+ else:
705
+ shutil.copy(me_full, new_me)
706
+
707
+
708
+ def get_prompt(prompt_type, chat, context, reduced):
709
+ if prompt_type in [-1, "-1", "plain"]:
710
+ promptA = promptB = PreInstruct = PreInput = PreResponse = ''
711
+ terminate_response = []
712
+ elif prompt_type == 'simple_instruct':
713
+ promptA = promptB = PreInstruct = PreInput = PreResponse = None
714
+ terminate_response = []
715
+ elif prompt_type in [0, "0", "instruct"] or prompt_type in [7, "7", "instruct_with_end"]:
716
+ promptA = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
717
+ promptB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
718
+
719
+ PreInstruct = """
720
+ ### Instruction:
721
+ """
722
+
723
+ PreInput = """
724
+ ### Input:
725
+ """
726
+
727
+ PreResponse = """
728
+ ### Response:
729
+ """
730
+ if prompt_type in [7, "7", "instruct_with_end"]:
731
+ terminate_response = ['### End']
732
+ else:
733
+ terminate_response = None
734
+ elif prompt_type in [1, "1", "quality"]:
735
+ promptA = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction as applied on the Input.\n' if not (chat and reduced) else ''
736
+ promptB = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction.\n' if not (chat and reduced) else ''
737
+
738
+ PreInstruct = """
739
+ ### Instruction:
740
+ """
741
+
742
+ PreInput = """
743
+ ### Input:
744
+ """
745
+
746
+ PreResponse = """
747
+ ### Response:
748
+ """
749
+ terminate_response = None
750
+ elif prompt_type in [2, "2", "human_bot", 9, "9", "human_bot_orig"]:
751
+ if reduced or context or prompt_type in [2, "2", "human_bot"]:
752
+ preprompt = ''
753
+ else:
754
+ cur_date = time.strftime('%Y-%m-%d')
755
+ cur_time = time.strftime('%H:%M:%S %p %Z')
756
+
757
+ PRE_PROMPT = """\
758
+ Current Date: {}
759
+ Current Time: {}
760
+
761
+ """
762
+ preprompt = PRE_PROMPT.format(cur_date, cur_time)
763
+ start = human
764
+ promptB = promptA = '%s%s ' % (preprompt, start)
765
+
766
+ PreInstruct = ""
767
+
768
+ PreInput = None
769
+
770
+ PreResponse = bot
771
+
772
+ terminate_response = [start, PreResponse]
773
+ elif prompt_type in [3, "3", "dai_faq"]:
774
+ promptA = ''
775
+ promptB = 'Answer the following Driverless AI question.\n'
776
+
777
+ PreInstruct = """
778
+ ### Driverless AI frequently asked question:
779
+ """
780
+
781
+ PreInput = None
782
+
783
+ PreResponse = """
784
+ ### Driverless AI documentation answer:
785
+ """
786
+ terminate_response = ['\n\n']
787
+ elif prompt_type in [5, "5", "summarize"]:
788
+ promptA = promptB = PreInput = ''
789
+ PreInstruct = '## Main Text\n\n'
790
+ PreResponse = '\n\n## Summary\n\n'
791
+ terminate_response = None
792
+ elif prompt_type in [6, "6", "instruct_vicuna"]:
793
+ promptA = promptB = "A chat between a curious human and an artificial intelligence assistant. " \
794
+ "The assistant gives helpful, detailed, and polite answers to the human's questions." if not (chat and reduced) else ''
795
+
796
+ PreInstruct = """
797
+ ### Human:
798
+ """
799
+
800
+ PreInput = None
801
+
802
+ PreResponse = """
803
+ ### Assistant:
804
+ """
805
+ terminate_response = ['### Human:'] # only allow termination after the prompt is found correctly; otherwise cannot terminate
806
+ else:
807
+ raise RuntimeError("No such prompt_type=%s" % prompt_type)
808
+
809
+ return promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response
810
+
811
+
812
+ def generate_prompt(data_point, prompt_type, chat, reduced):
813
+ context = data_point.get('context')
814
+ if context is None:
815
+ context = ''
816
+ instruction = data_point.get('instruction')
817
+ input = data_point.get('input')
818
+ output = data_point.get('output')
819
+ prompt_type = data_point.get('prompt_type', prompt_type)
820
+ assert prompt_type in prompt_types, "Bad prompt type: %s" % prompt_type
821
+ promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response = get_prompt(prompt_type, chat, context, reduced)
822
+
823
+ prompt = context
824
+
825
+ if input and promptA:
826
+ prompt += f"""{promptA}"""
827
+ elif promptB:
828
+ prompt += f"""{promptB}"""
829
+
830
+ if instruction and PreInstruct is not None and input and PreInput is not None:
831
+ prompt += f"""{PreInstruct}{instruction}{PreInput}{input}"""
832
+ prompt = inject_newline(prompt_type, prompt)
833
+ elif instruction and input and PreInstruct is None and PreInput is not None:
834
+ prompt += f"""{PreInput}{instruction}
835
+ {input}"""
836
+ prompt = inject_newline(prompt_type, prompt)
837
+ elif input and instruction and PreInput is None and PreInstruct is not None:
838
+ prompt += f"""{PreInstruct}{instruction}
839
+ {input}"""
840
+ prompt = inject_newline(prompt_type, prompt)
841
+ elif instruction and PreInstruct is not None:
842
+ prompt += f"""{PreInstruct}{instruction}"""
843
+ prompt = inject_newline(prompt_type, prompt)
844
+ elif input and PreInput is not None:
845
+ prompt += f"""{PreInput}{input}"""
846
+ prompt = inject_newline(prompt_type, prompt)
847
+ elif input and instruction and PreInput is not None:
848
+ prompt += f"""{PreInput}{instruction}{input}"""
849
+ prompt = inject_newline(prompt_type, prompt)
850
+ elif input and instruction and PreInstruct is not None:
851
+ prompt += f"""{PreInstruct}{instruction}{input}"""
852
+ prompt = inject_newline(prompt_type, prompt)
853
+ elif input and instruction:
854
+ # i.e. for simple_instruct
855
+ prompt += f"""{instruction}: {input}"""
856
+ prompt = inject_newline(prompt_type, prompt)
857
+ elif input:
858
+ prompt += f"""{input}"""
859
+ prompt = inject_newline(prompt_type, prompt)
860
+ elif instruction:
861
+ prompt += f"""{instruction}"""
862
+ prompt = inject_newline(prompt_type, prompt)
863
+
864
+ if PreResponse is not None:
865
+ prompt += f"""{PreResponse}"""
866
+ pre_response = PreResponse # Don't use strip
867
+ else:
868
+ pre_response = ''
869
+
870
+ if output:
871
+ prompt += f"""{output}"""
872
+
873
+ return prompt, pre_response, terminate_response
874
+
875
+
876
+ def inject_newline(prompt_type, prompt):
877
+ if prompt_type not in [-1, '-1', 'plain', 'simple_instruct']:
878
+ # only add new line if structured prompt, while 'plain' is just generation of next tokens from input
879
+ prompt += '\n'
880
+ return prompt
881
+
882
+
883
+ example_data_point0 = dict(instruction="Summarize",
884
+ input="Ducks eat seeds by the lake, then swim in the lake where fish eat small animals.",
885
+ output="Ducks eat and swim at the lake.")
886
+
887
+ example_data_point1 = dict(instruction="Who is smarter, Einstein or Newton?",
888
+ output="Einstein.")
889
+
890
+ example_data_point2 = dict(input="Who is smarter, Einstein or Newton?",
891
+ output="Einstein.")
892
+
893
+ example_data_points = [example_data_point0, example_data_point1, example_data_point2]
894
+
895
+
896
+ def test_train_prompt(prompt_type='instruct', data_point=0):
897
+ example_data_point = example_data_points[data_point]
898
+ return generate_prompt(example_data_point, prompt_type, False, False)
899
+
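A hedged way to inspect the rendered training prompt via the helper above; the shape shown in the comments is approximate, reconstructed from get_prompt for the 'instruct' type.

    prompt, pre_response, terminate_response = test_train_prompt('instruct', 0)
    # prompt roughly looks like:
    #   Below is an instruction that describes a task, paired with an input ...
    #   ### Instruction:
    #   Summarize
    #   ### Input:
    #   Ducks eat seeds by the lake, ...
    #   ### Response:
    #   Ducks eat and swim at the lake.
    # pre_response is the "### Response:" block and terminate_response is None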
900
+
901
+ def test_debug():
902
+ fire.Fire(train)
903
+
904
+
905
+ if __name__ == "__main__":
906
+ CONFIG = "NCCL_P2P_LEVEL=LOC WORLD_SIZE=5 torchrun --nnodes=5 --master_addr=10.10.10.2 --master_port=1111 --nproc_per_node=1"
907
+ CMD = "finetune.py --data_path=config.json --num_epochs=1 --base_model=decapoda-research/llama-13b-hf"
908
+ log(f"""
909
+ Example runs on 4 GPUs:
910
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='decapoda-research/llama-7b-hf' --data_path=data/config.json --run_id=0 &> 0.log
911
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='decapoda-research/llama-30b-hf' --data_path=data/config.json --batch_size=16 --micro_batch_size=1 --run_id=1 --save_code=True &> 1.log
912
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='EleutherAI/gpt-j-6B' --data_path=data/config.json --run_id=2 &> 2.log
913
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='EleutherAI/gpt-neox-20b' --data_path=data/config.json --run_id=8 --batch_size=16 --micro_batch_size=4 &> 8.log
914
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --data_path=data/config.json --prompt_type='dai_faq' --run_id=13 --batch_size=16 --micro_batch_size=4 --num_epochs=100 --val_set_size=0 --data_mix_in_path='' &> 13.log
915
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --data_path=data/config.json --run_id=28 --batch_size=16 --micro_batch_size=4 --num_epochs=8 --val_set_size=0 --data_mix_in_factor=0.1 --data_mix_in_prompt_type='human_bot' --save_code=True --cutoff_len=512 &> 28.log
916
+
917
+ All metrics:
918
+ CUDA_VISIBLE_DEVICES= finetune.py --data_mix_in_factor=0 --eval_steps=100 --warmup_steps=2 --val_set_size=100 --val_metrics="['bleu', 'rouge', 'sacrebleu', 'meteor']"
919
+
920
+ # Fine-tune 20B on 24GB GPUs across 3 nodes with 3+2+2 GPUs
921
+ rippa>
922
+ NCCL_P2P_LEVEL=LOC WORLD_SIZE=7 CUDA_VISIBLE_DEVICES="0,1,2" torchrun --node_rank 0 --nproc_per_node=3 --master_port=1234 --nnodes=3 --master_addr=10.10.10.2 finetune.py --data_path=merged_shuffled_OIG_87f6a1e788.json --micro_batch_size=1 --batch_size=7 --cutoff_len=512 --run_id=17 &>log.17.rank0
923
+ ova>
924
+ NCCL_P2P_LEVEL=LOC WORLD_SIZE=7 CUDA_VISIBLE_DEVICES="0,1" torchrun --node_rank 1 --nproc_per_node=2 --master_port=1234 --nnodes=3 --master_addr=10.10.10.2 finetune.py --data_path=merged_shuffled_OIG_87f6a1e788.json --micro_batch_size=1 --batch_size=7 --cutoff_len=512 --run_id=17 &>log.17.rank1
925
+ timemachine>
926
+ NCCL_P2P_LEVEL=LOC WORLD_SIZE=7 CUDA_VISIBLE_DEVICES="0,1" torchrun --node_rank 2 --nproc_per_node=2 --master_port=1234 --nnodes=3 --master_addr=10.10.10.2 finetune.py --data_path=merged_shuffled_OIG_87f6a1e788.json --micro_batch_size=1 --batch_size=7 --cutoff_len=512 --run_id=17 &>log.17.rank2
927
+
928
+ """, flush=True)
929
+
930
+ if os.environ.get("LOCAL_RANK") is None:
931
+ # then not using torchrun, so can't do distributed, ensure CVD set
932
+ assert os.environ.get("CUDA_VISIBLE_DEVICES") is not None, "Run python script using: torchrun finetune.py OR set CUDA_VISIBLE_DEVICES to single GPU"
933
+
934
+ fire.Fire(train)
h2o-logo.svg ADDED
prompter.py ADDED
@@ -0,0 +1,106 @@
1
+ from finetune import generate_prompt
2
+
3
+
4
+ class Prompter(object):
5
+ def __init__(self, prompt_type, debug=False, chat=False, stream_output=False, repeat_penalty=True,
6
+ allowed_repeat_line_length=10):
7
+ self.prompt_type = prompt_type
8
+ data_point = dict(instruction='', input='', output='')
9
+ _, self.pre_response, self.terminate_response = generate_prompt(data_point, prompt_type, chat, False)
10
+ self.debug = debug
11
+ self.chat = chat
12
+ self.stream_output = stream_output
13
+ self.repeat_penalty = repeat_penalty
14
+ self.allowed_repeat_line_length = allowed_repeat_line_length
15
+
16
+ def generate_prompt(self, data_point):
17
+ reduced = False
18
+ prompt, _, _ = generate_prompt(data_point, self.prompt_type, self.chat, reduced)
19
+ if self.debug:
20
+ print("prompt: ", prompt, flush=True)
21
+ self.prompt = prompt
22
+ return prompt
23
+
24
+ def get_response(self, outputs, prompt=None, sanitize_bot_response=True):
25
+ if isinstance(outputs, str):
26
+ outputs = [outputs]
27
+ if self.debug:
28
+ print("output: ", '\n\n'.join(outputs), flush=True)
29
+ if prompt is not None:
30
+ self.prompt = prompt
31
+
32
+ def clean_response(response):
33
+ meaningless_words = ['<pad>', '</s>', '<|endoftext|>', '”\n']
34
+ for word in meaningless_words:
35
+ response = response.replace(word, "")
36
+ if sanitize_bot_response:
37
+ from better_profanity import profanity
38
+ response = profanity.censor(response)
39
+ response = response.strip("\n")
40
+ return response
41
+
42
+ def clean_repeats(response):
43
+ lines = response.split('\n')
44
+ new_lines = []
45
+ [new_lines.append(line) for line in lines if
46
+ line not in new_lines or len(line) < self.allowed_repeat_line_length]
47
+ if self.debug and len(lines) != len(new_lines):
48
+ print("cleaned repeats: %s %s" % (len(lines), len(new_lines)), flush=True)
49
+ response = '\n'.join(new_lines)
50
+ return response
51
+
52
+ multi_output = len(outputs) > 1
53
+
54
+ for oi, output in enumerate(outputs):
55
+ if self.prompt_type in [0, '0', 'plain']:
56
+ output = clean_response(output)
57
+ else:
58
+ # find the first instance of pre_response
59
+ # prompt sometimes has odd characters that mutate length,
60
+ # so can't go by length alone
61
+ if self.pre_response:
62
+ outputi = output.find(prompt)
63
+ if outputi >= 0:
64
+ output = output[outputi + len(prompt):]
65
+ allow_terminate = True
66
+ else:
67
+ # subtraction is risky due to space offsets sometimes, so only do if necessary
68
+ output = output[len(prompt) - len(self.pre_response):]
69
+ # [1] to avoid repeated pre_response, just take first (after prompt - pre_response for chat)
70
+ if self.pre_response in output:
71
+ output = output.split(self.pre_response)[1]
72
+ allow_terminate = True
73
+ else:
74
+ print("Failure of parsing: %s" % output, flush=True)
75
+ allow_terminate = False
76
+ else:
77
+ allow_terminate = True
78
+ output = output[len(prompt):]
79
+ # clean after subtract prompt out, so correct removal of pre_response
80
+ output = clean_response(output).strip()
81
+ if self.repeat_penalty:
82
+ output = clean_repeats(output).strip()
83
+ if self.terminate_response and allow_terminate:
84
+ finds = []
85
+ for term in self.terminate_response:
86
+ finds.append(output.find(term))
87
+ finds = [x for x in finds if x >= 0]
88
+ if len(finds) > 0:
89
+ termi = finds[0]
90
+ output = output[:termi].strip()
91
+ else:
92
+ output = output.strip()
93
+ else:
94
+ output = output.strip()
95
+ if multi_output:
96
+ # prefix with output counter
97
+ output = "\n=========== Output %d\n\n" % (1 + oi) + output
98
+ if oi > 0:
99
+ # postfix outputs with separator
100
+ output += '\n'
101
+ outputs[oi] = output
102
+ # join all outputs, only one extra new line between outputs
103
+ output = '\n'.join(outputs)
104
+ if self.debug:
105
+ print("outputclean: ", '\n\n'.join(outputs), flush=True)
106
+ return output
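A brief, hedged usage sketch of the Prompter class above; the model continuation string is invented for illustration.

    prompter = Prompter('instruct', debug=False, chat=False)
    data_point = dict(instruction="Summarize", input="Ducks eat seeds by the lake.", output="")
    prompt = prompter.generate_prompt(data_point)
    raw_output = prompt + "Ducks eat at the lake."           # assumed model continuation
    print(prompter.get_response(raw_output, prompt=prompt))  # -> "Ducks eat at the lake."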
requirements.txt ADDED
@@ -0,0 +1,48 @@
1
+ # for generate (gradio server) and finetune
2
+ datasets==2.10.1
3
+ sentencepiece==0.1.97
4
+ accelerate==0.18.0
5
+ gradio==3.27.0
6
+ huggingface_hub==0.13.4
7
+ appdirs==1.4.4
8
+ fire==0.5.0
9
+ docutils==0.19
10
+ torch==2.0.0
11
+ evaluate==0.4.0
12
+ rouge_score==0.1.2
13
+ sacrebleu==2.3.1
14
+ scikit-learn==1.2.2
15
+ alt-profanity-check==1.2.2
16
+ better-profanity==0.6.1
17
+ numpy==1.24.2
18
+ pandas==1.5.3
19
+ matplotlib==3.7.1
20
+ loralib==0.1.1
21
+ bitsandbytes==0.38.1
22
+ git+https://github.com/huggingface/peft.git@098962fa6515f2e4fe83a757f5995d3ffbb1c373
23
+ transformers==4.28.1
24
+ tokenizers==0.13.3
25
+
26
+ # optional for generate
27
+ pynvml==11.5.0
28
+ psutil==5.9.4
29
+
30
+ # optional for finetune
31
+ tensorboard==2.12.1
32
+ neptune==1.1.1
33
+
34
+ # for gradio client
35
+ gradio_client==0.1.3
36
+ beautifulsoup4==4.12.2
37
+ markdown==3.4.1
38
+
39
+ # data and testing
40
+ pytest==7.2.2
41
+ pytest-xdist==3.2.1
42
+ nltk==3.8.1
43
+ textstat==0.7.3
44
+ pandoc==2.3
45
+ pypandoc==1.11
46
+ openpyxl==3.1.2
47
+ lm_dataformat==0.0.20
48
+ bioc==2.0
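These pinned dependencies are installed in the usual way, e.g. pip install -r requirements.txt; per the section comments above, the "optional" groups can be skipped if only generation or only fine-tuning is needed.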
stopping.py ADDED
@@ -0,0 +1,139 @@
1
+ import traceback
2
+ from queue import Queue
3
+ from threading import Thread
4
+ import collections.abc
5
+
6
+ import torch
7
+ from transformers import StoppingCriteria
8
+
9
+
10
+ class StoppingCriteriaSub(StoppingCriteria):
11
+
12
+ def __init__(self, stops=[], encounters=[]):
13
+ super().__init__()
14
+ assert len(stops) % len(encounters) == 0, "Number of stops must be a multiple of number of encounters"
15
+ self.encounters = encounters
16
+ self.stops = [stop.to("cuda") for stop in stops]
17
+ self.num_stops = [0] * len(stops)
18
+
19
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
20
+ for stopi, stop in enumerate(self.stops):
21
+ if torch.all((stop == input_ids[0][-len(stop):])).item():
22
+ self.num_stops[stopi] += 1
23
+ if self.num_stops[stopi] >= self.encounters[stopi % len(self.encounters)]:
24
+ return True
25
+ # print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
26
+ # print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
27
+ return False
28
+
29
+
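A hedged sketch of wiring StoppingCriteriaSub into generation; the stop string, tokenizer, and model are placeholders, and a CUDA device is assumed since the stops are moved to "cuda" in __init__.

    from transformers import StoppingCriteriaList
    # assumes `tokenizer` and `model` are already loaded as in finetune.py
    stop_ids = torch.tensor(tokenizer("<human>:", add_special_tokens=False)["input_ids"])
    stopping = StoppingCriteriaList([StoppingCriteriaSub(stops=[stop_ids], encounters=[1])])
    # outputs = model.generate(**inputs, stopping_criteria=stopping, max_new_tokens=128)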
30
+ class Stream(StoppingCriteria):
31
+ """
32
+ This class can be used to callback during generation. Keep
33
+ in mind for decoder-only type of transformers, this will include the initial prompted tokens.
34
+
35
+ Args:
36
+ func (`callable`):
37
+ A callable function to apply on first input in list every iteration of generation
38
+ """
39
+
40
+ def __init__(self, func=None):
41
+ self.func = func
42
+
43
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
44
+ if self.func is not None:
45
+ # only consume first of multiple responses
46
+ self.func(input_ids[0])
47
+ return False
48
+
49
+
50
+ class CallbackToGenerator(collections.abc.Generator):
51
+ """
52
+ A generator wrapper for a function that invokes a callback multiple times.
53
+
54
+ Calling `send` on the generator emits a value from one callback, and returns
55
+ the next.
56
+
57
+ Note this starts a background thread
58
+ """
59
+
60
+ def __init__(self, func, *args, callback=None, **kwargs):
61
+ self.func = func
62
+ self.args = args
63
+ self.kwargs = kwargs
64
+ self.callback = callback
65
+
66
+ self._ready_queue = Queue(1)
67
+ self._done_queue = Queue(1)
68
+ self._done_holder = [False]
69
+
70
+ # local to avoid reference cycles
71
+ ready_queue = self._ready_queue
72
+ done_queue = self._done_queue
73
+ done_holder = self._done_holder
74
+
75
+ def val_callback(value):
76
+ done_queue.put((False, value))
77
+ cmd, val = ready_queue.get()
78
+ if cmd == 'send':
79
+ return val
80
+ elif cmd == 'throw':
81
+ raise val
82
+ else:
83
+ assert False # pragma: no cover
84
+
85
+ def thread_func():
86
+ while True:
87
+ cmd, val = ready_queue.get()
88
+ if cmd == 'send' and val is not None:
89
+ done_queue.put((True, TypeError("can't send non-None value to a just-started generator")))
90
+ continue
91
+ break
92
+ try:
93
+ if cmd == 'throw':
94
+ raise val
95
+ ret = func(callback=val_callback, **self.kwargs)
96
+ raise StopIteration(ret) if ret is not None else StopIteration
97
+ except BaseException as e:
98
+ done_holder[0] = True
99
+ done_queue.put((True, e))
100
+
101
+ self._thread = Thread(target=thread_func)
102
+ self._thread.start()
103
+
104
+ def _put(self, *args):
105
+ if self._done_holder[0]:
106
+ raise StopIteration
107
+ self._ready_queue.put(args)
108
+ is_exception, val = self._done_queue.get()
109
+ if is_exception:
110
+ try:
111
+ raise val
112
+ finally:
113
+ # prevent val's traceback containing a reference cycle
114
+ del val
115
+ else:
116
+ return val
117
+
118
+ def send(self, value):
119
+ return self._put('send', value)
120
+
121
+ def throw(self, exc):
122
+ return self._put('throw', exc)
123
+
124
+ def close(self):
125
+ try:
126
+ self.throw(GeneratorExit)
127
+ except StopIteration:
128
+ self._thread.join()
129
+ except GeneratorExit:
130
+ self._thread.join()
131
+ except BaseException:
132
+ self._thread.join()
133
+ raise
134
+ else:
135
+ # yielded again, can't clean up the thread
136
+ raise RuntimeError('Task with callback ignored GeneratorExit')
137
+
138
+ def __del__(self):
139
+ self.close()
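A minimal hedged example of CallbackToGenerator with a toy function (everything below is invented for illustration); note that positional *args stored by __init__ are not forwarded to func in this version, only **kwargs.

    def produce(callback=None):
        for i in range(3):
            callback(i)        # each callback value is emitted by the generator
        return "done"

    gen = CallbackToGenerator(produce)
    for value in gen:
        print(value)           # prints 0, 1, 2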
utils.py ADDED
@@ -0,0 +1,154 @@
1
+ import os
2
+ import gc
3
+ import random
4
+ import time
5
+ import traceback
6
+ import zipfile
7
+ from datetime import datetime
8
+ import filelock
9
+ import numpy as np
10
+ import pandas as pd
11
+ import torch
12
+
13
+
14
+ def set_seed(seed: int):
15
+ """
16
+ Sets global random seeds so results are the same every time we run.
17
+ This is for REPRODUCIBILITY.
18
+ """
19
+ np.random.seed(seed)
20
+ random_state = np.random.RandomState(seed)
21
+ random.seed(seed)
22
+ torch.manual_seed(seed)
23
+ torch.cuda.manual_seed(seed)
24
+ torch.backends.cudnn.deterministic = True
25
+ torch.backends.cudnn.benchmark = False
26
+ os.environ['PYTHONHASHSEED'] = str(seed)
27
+ return random_state
28
+
29
+
30
+ def flatten_list(lis):
31
+ """Given a list, possibly nested to any level, return it flattened."""
32
+ new_lis = []
33
+ for item in lis:
34
+ if type(item) == type([]):
35
+ new_lis.extend(flatten_list(item))
36
+ else:
37
+ new_lis.append(item)
38
+ return new_lis
39
+
40
+
41
+ def clear_torch_cache():
42
+ if torch.cuda.is_available():
43
+ torch.cuda.empty_cache()
44
+ torch.cuda.ipc_collect()
45
+ gc.collect()
46
+
47
+
48
+ def system_info():
49
+ import psutil
50
+
51
+ system = {}
52
+ # https://stackoverflow.com/questions/48951136/plot-multiple-graphs-in-one-plot-using-tensorboard
53
+ # https://arshren.medium.com/monitoring-your-devices-in-python-5191d672f749
54
+ temps = psutil.sensors_temperatures(fahrenheit=False)
55
+ if 'coretemp' in temps:
56
+ coretemp = temps['coretemp']
57
+ temp_dict = {k.label: k.current for k in coretemp}
58
+ for k, v in temp_dict.items():
59
+ system['CPU_C/%s' % k] = v
60
+
61
+ # https://github.com/gpuopenanalytics/pynvml/blob/master/help_query_gpu.txt
62
+ from pynvml.smi import nvidia_smi
63
+ nvsmi = nvidia_smi.getInstance()
64
+
65
+ gpu_power_dict = {'W_gpu%d' % i: x['power_readings']['power_draw'] for i, x in
66
+ enumerate(nvsmi.DeviceQuery('power.draw')['gpu'])}
67
+ for k, v in gpu_power_dict.items():
68
+ system['GPU_W/%s' % k] = v
69
+
70
+ gpu_temp_dict = {'C_gpu%d' % i: x['temperature']['gpu_temp'] for i, x in
71
+ enumerate(nvsmi.DeviceQuery('temperature.gpu')['gpu'])}
72
+ for k, v in gpu_temp_dict.items():
73
+ system['GPU_C/%s' % k] = v
74
+
75
+ gpu_memory_free_dict = {'MiB_gpu%d' % i: x['fb_memory_usage']['free'] for i, x in
76
+ enumerate(nvsmi.DeviceQuery('memory.free')['gpu'])}
77
+ gpu_memory_total_dict = {'MiB_gpu%d' % i: x['fb_memory_usage']['total'] for i, x in
78
+ enumerate(nvsmi.DeviceQuery('memory.total')['gpu'])}
79
+ gpu_memory_frac_dict = {k: gpu_memory_free_dict[k] / gpu_memory_total_dict[k] for k in gpu_memory_total_dict}
80
+ for k, v in gpu_memory_frac_dict.items():
81
+ system[f'GPU_M/%s' % k] = v
82
+
83
+ return system
84
+
85
+
86
+ def system_info_print():
87
+ try:
88
+ df = pd.DataFrame.from_dict(system_info(), orient='index')
89
+ # avoid slamming GPUs
90
+ time.sleep(1)
91
+ return df.to_markdown()
92
+ except Exception as e:
93
+ return "Error: %s" % str(e)
94
+
95
+
96
+ def zip_data(root_dirs=None, zip_file=None, base_dir='./'):
97
+ try:
98
+ return _zip_data(zip_file=zip_file, base_dir=base_dir, root_dirs=root_dirs)
99
+ except Exception as e:
100
+ traceback.print_exc()
101
+ print('Exception in zipping: %s' % str(e))
102
+
103
+
104
+ def _zip_data(root_dirs=None, zip_file=None, base_dir='./'):
105
+ if zip_file is None:
106
+ datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
107
+ host_name = os.getenv('HF_HOSTNAME', 'emptyhost')
108
+ zip_file = "data_%s_%s.zip" % (datetime_str, host_name)
109
+ assert root_dirs is not None
110
+
111
+ with zipfile.ZipFile(zip_file, "w") as expt_zip:
112
+ for root_dir in root_dirs:
113
+ if root_dir is None:
114
+ continue
115
+ for root, d, files in os.walk(root_dir):
116
+ for file in files:
117
+ file_to_archive = os.path.join(root, file)
118
+ assert os.path.exists(file_to_archive)
119
+ path_to_archive = os.path.relpath(file_to_archive, base_dir)
120
+ expt_zip.write(filename=file_to_archive, arcname=path_to_archive)
121
+ return zip_file
122
+
123
+
124
+ def save_generate_output(output=None, base_model=None, save_dir=None):
125
+ try:
126
+ return _save_generate_output(output=output, base_model=base_model, save_dir=save_dir)
127
+ except Exception as e:
128
+ traceback.print_exc()
129
+ print('Exception in saving: %s' % str(e))
130
+
131
+
132
+ def _save_generate_output(output=None, base_model=None, save_dir=None):
133
+ """
134
+ Save conversation to .json, row by row.
135
+ Rows are written to save_dir/history.json; save_dir is created if it does not exist.
136
+ Appends if the file already exists.
137
+ """
138
+ assert save_dir, "save_dir must be provided"
139
+ if os.path.exists(save_dir) and not os.path.isdir(save_dir):
140
+ raise RuntimeError("save_dir already exists and is not a directory!")
141
+ os.makedirs(save_dir, exist_ok=True)
142
+ import json
143
+ if output[-10:] == '\n\n<human>:':
144
+ # remove trailing <human>:
145
+ output = output[:-10]
146
+ with filelock.FileLock("save_dir.lock"):
147
+ # lock logging in case have concurrency
148
+ with open(os.path.join(save_dir, "history.json"), "a") as f:
149
+ # just add [ at start, and ] at end, and have proper JSON dataset
150
+ f.write(
151
+ " " + json.dumps(
152
+ dict(text=output, time=time.ctime(), base_model=base_model)
153
+ ) + ",\n"
154
+ )