fix some typo
- LICENSE +211 -211
- README.md +0 -2
- app.py +3 -3
- codeclm/models/codeclm.py +40 -53
- codeclm/tokenizer/Flow1dVAE/generate_septoken.py +3 -2
- codeclm/tokenizer/Flow1dVAE/libs/rvq/core_vq.py +366 -366
- codeclm/tokenizer/Flow1dVAE/model_1rvq.py +710 -710
- codeclm/tokenizer/Flow1dVAE/model_2rvq.py +774 -774
- codeclm/tokenizer/Flow1dVAE/model_4rvq.py +774 -774
- codeclm/tokenizer/Flow1dVAE/model_septoken.py +670 -670
- codeclm/tokenizer/Flow1dVAE/models/unet_2d_condition_additionalemb.py +0 -0
- codeclm/tokenizer/Flow1dVAE/models/unet_2d_condition_flow.py +0 -0
- codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/pinyin/symbols.py +71 -71
- codeclm/tokenizer/Flow1dVAE/tools/infer_bsrnnvae441k.py +47 -47
- codeclm/tokenizer/Flow1dVAE/tools/infer_bsrnnvae441k_vocal.py +47 -47
- codeclm/tokenizer/Flow1dVAE/tools/infer_hifigan48k_speech.py +56 -56
- codeclm/tokenizer/Flow1dVAE/tools/infer_hifigan48k_vocal.py +57 -57
- codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k.py +59 -59
- codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_soundmusic.py +61 -61
- codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_speech.py +58 -58
- codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_vocal.py +59 -59
- codeclm/tokenizer/Flow1dVAE/tools/mix.py +50 -50
- codeclm/tokenizer/Flow1dVAE/tools/torch_tools.py +142 -142
- codeclm/tokenizer/audio_tokenizer.py +2 -2
- generate_lowmem.py +240 -0
- generate_lowmem.sh +10 -0
- requirements.txt +24 -0
- requirements_nodeps.txt +13 -0
- sample/lyrics.jsonl +1 -1
- tools/gradio/app.py +236 -0
- tools/gradio/levo_inference.py +110 -0
- tools/gradio/levo_inference_lowmem.py +129 -0
- tools/gradio/run.sh +9 -0
- tools/gradio/separator.py +50 -0
LICENSE
CHANGED
@@ -1,211 +1,211 @@
Tencent is pleased to support the open source community by making SongGeneration available.

Copyright (C) 2025 Tencent. All rights reserved.

SongGeneration is licensed under the License Terms of SongGeneration except for the third-party components listed below, which is licensed under different terms. SongGeneration does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.


License Terms of SongGeneration:
--------------------------------------------------------------------

Permission is hereby granted, free of charge, to any person obtaining a copy of this Software and associated documentation files, to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

- You agree to use the SongGeneration only for academic, research and education purposes, and refrain from using it for any commercial or production purposes under any circumstances.

- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

For avoidance of doubts, "Software" means the SongGeneration inference-enabling code and the weights made available under this license excluding any pre-trained data and other AI components.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


Other dependencies and licenses:


Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. stable_audio_tools
Copyright (c) 2023 Stability AI


Terms of the MIT:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

For the license of other third party components, please refer to the following URL:
https://github.com/Stability-AI/stable-audio-tools/tree/main/LICENSES


Open Source Software Licensed under the MIT License:
--------------------------------------------------------------------
1. demucs
Copyright (c) Meta Platforms, Inc. and affiliates.


A copy of the MIT is included in this file.



Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. torch
From PyTorch:

Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)

From Caffe2:

Copyright (c) 2016-present, Facebook Inc. All rights reserved.

All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.

All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.

All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.

All contributions by Kakao Brain:
Copyright 2019-2020 Kakao Brain

All contributions by Cruise LLC:
Copyright (c) 2022 Cruise LLC.
All rights reserved.

All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.

All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.

Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further
mark their specific copyright on a particular contribution, they should
indicate their copyright solely in the commit message of the change when it is
committed.

All rights reserved.


Terms of the BSD 3-Clause:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

For the license of other third party components, please refer to the following URL:
https://github.com/pytorch/pytorch/blob/v2.0.1/NOTICE


Open Source Software Licensed under the BSD 2-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. torchaudio
Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
All rights reserved.


Terms of the BSD 2-Clause:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

For the license of other third party components, please refer to the following URL:
https://github.com/pytorch/audio/blob/v2.0.2/LICENSE


Open Source Software License under the Apache License Version 2.0:
--------------------------------------------------------------------
1. huggingface-hub
Copyright (c) huggingface-hub original author and authors

2. transformers
Copyright 2018- The Hugging Face team. All rights reserved.


Terms of the Apache License Version 2.0:
--------------------------------------------------------------------
Apache License

Version 2.0, January 2004

http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

You must give any other recipients of the Work or Derivative Works a copy of this License; and

You must cause any modified files to carry prominent notices stating that You changed the files; and

You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS
README.md
CHANGED
@@ -7,11 +7,9 @@ sdk: docker
 app_port: 7860
 ---
 
-
 <p align="center">
 <a href="https://levo-demo.github.io/">Demo</a> | <a href="https://arxiv.org/abs/2506.07520">Paper</a> | <a href="https://github.com/tencent-ailab/songgeneration">Code</a>
 </p>
-
 This repository is the official weight repository for LeVo: High-Quality Song Generation with Multi-Preference Alignment. In this repository, we provide the SongGeneration model, inference scripts, and the checkpoint that has been trained on the Million Song Dataset.
 
 ## Overview
app.py
CHANGED
@@ -124,9 +124,9 @@ def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_co
 
 
 # Create the Gradio interface
-with gr.Blocks(title="
-    gr.Markdown("# 🎵
-    gr.Markdown("Demo interface for the song generation model. Provide a lyrics, and optionally an audio or text prompt, to generate a custom song.")
+with gr.Blocks(title="SongGeneration Demo Space") as demo:
+    gr.Markdown("# 🎵 SongGeneration Demo Space")
+    gr.Markdown("Demo interface for the song generation model. Provide a lyrics, and optionally an audio or text prompt, to generate a custom song. The code is in [GIT](https://github.com/tencent-ailab/SongGeneration)")
 
     with gr.Row():
         with gr.Column():
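For orientation, a Gradio Blocks app defined this way is normally served by calling launch() on the demo object. The sketch below is an assumption about how such a Space could be started, not code taken from this commit; the port simply mirrors the app_port: 7860 value shown in the README.

# Hypothetical launch snippet; the real entry point of app.py is not shown in this diff.
import gradio as gr

with gr.Blocks(title="SongGeneration Demo Space") as demo:
    gr.Markdown("# 🎵 SongGeneration Demo Space")

if __name__ == "__main__":
    # 0.0.0.0 and 7860 are assumptions matching a typical Docker-based Space setup.
    demo.launch(server_name="0.0.0.0", server_port=7860)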
codeclm/models/codeclm.py
CHANGED
@@ -36,6 +36,10 @@ class CodecLM:
                  max_duration: tp.Optional[float] = None, seperate_tokenizer: AudioTokenizer = None):
         self.name = name
         self.audiotokenizer = audiotokenizer
+        if self.audiotokenizer:
+            self.frame_rate = self.audiotokenizer.frame_rate
+        else:
+            self.frame_rate = 25
         self.lm = lm
         self.seperate_tokenizer = seperate_tokenizer
         # import pdb; pdb.set_trace()
@@ -47,7 +51,7 @@ class CodecLM:
         assert max_duration is not None
 
         self.max_duration: float = max_duration
-        self.device =
+        self.device = torch.device("cuda")
         self.generation_params: dict = {}
         # self.set_generation_params(duration=15) # 15 seconds by default
         self.set_generation_params(duration=15, extend_stride=self.max_duration // 2)
@@ -57,23 +61,6 @@ class CodecLM:
         else:
             self.autocast = TorchAutocast(enabled=False)
 
-
-
-    @property
-    def frame_rate(self) -> float:
-        """Roughly the number of AR steps per seconds."""
-        return self.audiotokenizer.frame_rate
-
-    @property
-    def sample_rate(self) -> int:
-        """Sample rate of the generated audio."""
-        return self.audiotokenizer.sample_rate
-
-    @property
-    def audio_channels(self) -> int:
-        """Audio channels of the generated audio."""
-        return self.audiotokenizer.channels
-
     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
                               top_p: float = 0.0, temperature: float = 1.0,
                               duration: float = 30.0, cfg_coef: float = 3.0,
@@ -185,7 +172,7 @@ class CodecLM:
         assert len(lyrics) == 1
         texts = [lyric for lyric in lyrics]
         audio_qt_embs = []
-        target_melody_token_len = self.lm.cfg.prompt_len * self.
+        target_melody_token_len = self.lm.cfg.prompt_len * self.frame_rate
         # import pdb; pdb.set_trace()
         if melody_wavs is None:
             melody_tokens = torch.full((1,1,target_melody_token_len), 16385, device=self.device).long()
@@ -207,39 +194,39 @@ class CodecLM:
             melody_tokens = melody_tokens[...,:target_melody_token_len]
         elif melody_tokens.shape[-1] < target_melody_token_len:
             melody_tokens = torch.cat([melody_tokens, torch.full((1,1,target_melody_token_len - melody_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
-
-
-
-
-
+
+        if bgm_wavs is None:
+            assert vocal_wavs is None, "vocal_wavs is not None when bgm_wavs is None"
+            bgm_tokens = torch.full((1,1,target_melody_token_len), 16385, device=self.device).long()
+            vocal_tokens = torch.full((1,1,target_melody_token_len), 16385, device=self.device).long()
+        else:
+            assert vocal_wavs is not None, "vocal_wavs is None when bgm_wavs is not None"
+            if type(vocal_wavs) == list:
+                vocal_wavs = torch.stack(vocal_wavs, dim=0)
+            if type(bgm_wavs) == list:
+                bgm_wavs = torch.stack(bgm_wavs, dim=0)
+            vocal_wavs = vocal_wavs.to(self.device)
+            bgm_wavs = bgm_wavs.to(self.device)
+            if melody_is_wav:
+                vocal_tokens, bgm_tokens = self.seperate_tokenizer.encode(vocal_wavs, bgm_wavs)
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                f"got vocal len={vocal_tokens.shape[-1]}, and bgm len={bgm_tokens.shape[-1]}"
-        if bgm_tokens.shape[-1] > target_melody_token_len:
-            bgm_tokens = bgm_tokens[...,:target_melody_token_len]
-        elif bgm_tokens.shape[-1] < target_melody_token_len:
-            bgm_tokens = torch.cat([bgm_tokens, torch.full((1,1,target_melody_token_len - bgm_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
-        if vocal_tokens.shape[-1] > target_melody_token_len:
-            vocal_tokens = vocal_tokens[...,:target_melody_token_len]
-        elif vocal_tokens.shape[-1] < target_melody_token_len:
-            vocal_tokens = torch.cat([vocal_tokens, torch.full((1,1,target_melody_token_len - vocal_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
-        melody_tokens = torch.cat([melody_tokens, vocal_tokens, bgm_tokens], dim=1)
+                vocal_tokens = vocal_wavs
+                bgm_tokens = bgm_wavs
+            assert len(vocal_tokens.shape) == len(bgm_tokens.shape) == 3, \
+                f"vocal and bgm tokens should have a shape [B, C, T]! " \
+                f"got vocal len={vocal_tokens.shape}, and bgm len={bgm_tokens.shape}"
+            assert vocal_tokens.shape[-1] == bgm_tokens.shape[-1], \
+                f"vocal and bgm tokens should have the same length! " \
+                f"got vocal len={vocal_tokens.shape[-1]}, and bgm len={bgm_tokens.shape[-1]}"
+            if bgm_tokens.shape[-1] > target_melody_token_len:
+                bgm_tokens = bgm_tokens[...,:target_melody_token_len]
+            elif bgm_tokens.shape[-1] < target_melody_token_len:
+                bgm_tokens = torch.cat([bgm_tokens, torch.full((1,1,target_melody_token_len - bgm_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
+            if vocal_tokens.shape[-1] > target_melody_token_len:
+                vocal_tokens = vocal_tokens[...,:target_melody_token_len]
+            elif vocal_tokens.shape[-1] < target_melody_token_len:
+                vocal_tokens = torch.cat([vocal_tokens, torch.full((1,1,target_melody_token_len - vocal_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
+            melody_tokens = torch.cat([melody_tokens, vocal_tokens, bgm_tokens], dim=1)
         assert melody_tokens.shape[-1] == target_melody_token_len
         audio_qt_embs = melody_tokens.long()
         return texts, audio_qt_embs
@@ -284,7 +271,7 @@ class CodecLM:
         return gen_tokens
 
     @torch.no_grad()
-    def generate_audio(self, gen_tokens: torch.Tensor, prompt=None, vocal_prompt=None, bgm_prompt=None):
+    def generate_audio(self, gen_tokens: torch.Tensor, prompt=None, vocal_prompt=None, bgm_prompt=None, chunked=False):
         """Generate Audio from tokens"""
         assert gen_tokens.dim() == 3
         if self.seperate_tokenizer is not None:
@@ -292,7 +279,7 @@ class CodecLM:
             gen_tokens_vocal = gen_tokens[:, [1], :]
             gen_tokens_bgm = gen_tokens[:, [2], :]
             # gen_audio_song = self.audiotokenizer.decode(gen_tokens_song, prompt)
-            gen_audio_seperate = self.seperate_tokenizer.decode([gen_tokens_vocal, gen_tokens_bgm], vocal_prompt, bgm_prompt)
+            gen_audio_seperate = self.seperate_tokenizer.decode([gen_tokens_vocal, gen_tokens_bgm], vocal_prompt, bgm_prompt, chunked=chunked)
             return gen_audio_seperate
         else:
             gen_audio = self.audiotokenizer.decode(gen_tokens, prompt)
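Taken together, these hunks make frame_rate available even when no audiotokenizer is loaded (falling back to 25) and let generate_audio forward a chunked flag to the separate tokenizer's decoder. The padding and truncation bookkeeping driven by target_melody_token_len can be sketched in isolation as below; the 25 tokens-per-second fallback and the 16385 padding id come from the diff, while prompt_len = 10 and the input length are made-up illustrative values.

import torch

frame_rate = 25                      # fallback frame rate from the diff above
prompt_len = 10                      # stand-in for self.lm.cfg.prompt_len (assumed value, in seconds)
pad_id = 16385                       # padding token id used throughout the hunk above
target_len = prompt_len * frame_rate

tokens = torch.randint(0, 16384, (1, 1, 180))      # a token stream shorter than target_len
if tokens.shape[-1] > target_len:
    tokens = tokens[..., :target_len]              # truncate streams that are too long
elif tokens.shape[-1] < target_len:
    pad = torch.full((1, 1, target_len - tokens.shape[-1]), pad_id).long()
    tokens = torch.cat([tokens, pad], dim=-1)      # pad short streams with pad_id
assert tokens.shape[-1] == target_len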
codeclm/tokenizer/Flow1dVAE/generate_septoken.py
CHANGED
@@ -173,7 +173,7 @@ class Tango:
         return codes_vocal, codes_bgm
 
     @torch.no_grad()
-    def code2sound(self, codes, prompt_vocal=None, prompt_bgm=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False):
+    def code2sound(self, codes, prompt_vocal=None, prompt_bgm=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False, chunked=False):
         codes_vocal,codes_bgm = codes
         codes_vocal = codes_vocal.to(self.device)
         codes_bgm = codes_bgm.to(self.device)
@@ -268,11 +268,12 @@ class Tango:
         min_samples = int(min_samples * self.sample_rate // 1000 * 40)
         hop_samples = int(hop_samples * self.sample_rate // 1000 * 40)
         ovlp_samples = min_samples - hop_samples
+        torch.cuda.empty_cache()
         with torch.no_grad():
            output = None
            for i in range(len(latent_list)):
                latent = latent_list[i]
-                cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
+                cur_output = self.vae.decode_audio(latent, chunked=chunked)[0].detach().cpu()
 
                if output is None:
                    output = cur_output
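The second hunk combines two low-memory measures: torch.cuda.empty_cache() is called before the decode loop, and each latent chunk is decoded with chunked=True and moved to the CPU immediately. Below is a generic, self-contained sketch of that pattern; decode_fn is a placeholder standing in for self.vae.decode_audio, and the simple concatenation at the end ignores the overlap handling (ovlp_samples) that the surrounding code performs.

import torch

def decode_in_chunks(latent_list, decode_fn):
    """Decode a list of latent chunks while keeping peak GPU memory bounded."""
    torch.cuda.empty_cache()                          # release cached allocations up front
    outputs = []
    with torch.no_grad():
        for latent in latent_list:
            cur = decode_fn(latent).detach().cpu()    # move each decoded chunk off the GPU right away
            outputs.append(cur)
    return torch.cat(outputs, dim=-1)                 # naive stitching; the real code overlap-adds chunks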
codeclm/tokenizer/Flow1dVAE/libs/rvq/core_vq.py
CHANGED
@@ -1,366 +1,366 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# This implementation is inspired from
# https://github.com/lucidrains/vector-quantize-pytorch
# which is released under MIT License. Hereafter, the original license:
# MIT License
#
# Copyright (c) 2020 Phil Wang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Core vector quantization implementation."""

import typing as tp

from einops import rearrange, repeat
import torch
from torch import nn
import torch.nn.functional as F

# from .. import distrib


def default(val: tp.Any, d: tp.Any) -> tp.Any:
    return val if val is not None else d


def ema_inplace(moving_avg, new, decay: float):
    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))


def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
    return (x + epsilon) / (x.sum() + n_categories * epsilon)


def uniform_init(*shape: int):
    t = torch.empty(shape)
    nn.init.kaiming_uniform_(t)
    return t


def sample_vectors(samples, num: int):
    num_samples, device = samples.shape[0], samples.device

    if num_samples >= num:
        indices = torch.randperm(num_samples, device=device)[:num]
    else:
        indices = torch.randint(0, num_samples, (num,), device=device)

    return samples[indices]


def kmeans(samples, num_clusters: int, num_iters: int = 10):
    dim, dtype = samples.shape[-1], samples.dtype

    means = sample_vectors(samples, num_clusters)

    for _ in range(num_iters):
        diffs = rearrange(samples, "n d -> n () d") - rearrange(
            means, "c d -> () c d"
        )
        dists = -(diffs ** 2).sum(dim=-1)

        buckets = dists.max(dim=-1).indices
        bins = torch.bincount(buckets, minlength=num_clusters)
        zero_mask = bins == 0
        bins_min_clamped = bins.masked_fill(zero_mask, 1)

        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
        new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
        new_means = new_means / bins_min_clamped[..., None]

        means = torch.where(zero_mask[..., None], means, new_means)

    return means, bins


class EuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance.
    Args:
        dim (int): Dimension.
        codebook_size (int): Codebook size.
        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
kmeans_init (bool): Whether to use k-means to initialize the codebooks.
|
104 |
-
If set to true, run the k-means algorithm on the first training batch and use
|
105 |
-
the learned centroids as initialization.
|
106 |
-
kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
|
107 |
-
decay (float): Decay for exponential moving average over the codebooks.
|
108 |
-
epsilon (float): Epsilon value for numerical stability.
|
109 |
-
threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
|
110 |
-
that have an exponential moving average cluster size less than the specified threshold with
|
111 |
-
randomly selected vector from the current batch.
|
112 |
-
"""
|
113 |
-
def __init__(
|
114 |
-
self,
|
115 |
-
dim: int,
|
116 |
-
codebook_size: int,
|
117 |
-
kmeans_init: int = False,
|
118 |
-
kmeans_iters: int = 10,
|
119 |
-
decay: float = 0.99,
|
120 |
-
epsilon: float = 1e-5,
|
121 |
-
threshold_ema_dead_code: int = 2,
|
122 |
-
):
|
123 |
-
super().__init__()
|
124 |
-
self.decay = decay
|
125 |
-
init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
|
126 |
-
embed = init_fn(codebook_size, dim)
|
127 |
-
|
128 |
-
self.codebook_size = codebook_size
|
129 |
-
|
130 |
-
self.kmeans_iters = kmeans_iters
|
131 |
-
self.epsilon = epsilon
|
132 |
-
self.threshold_ema_dead_code = threshold_ema_dead_code
|
133 |
-
|
134 |
-
self.register_buffer("inited", torch.Tensor([not kmeans_init]))
|
135 |
-
self.register_buffer("cluster_size", torch.zeros(codebook_size))
|
136 |
-
self.register_buffer("embed", embed)
|
137 |
-
self.register_buffer("embed_avg", embed.clone())
|
138 |
-
|
139 |
-
self.runed_steps = 0
|
140 |
-
self.stop_steps = 50_000
|
141 |
-
|
142 |
-
@torch.jit.ignore
|
143 |
-
def init_embed_(self, data):
|
144 |
-
if self.inited:
|
145 |
-
return
|
146 |
-
|
147 |
-
embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
|
148 |
-
self.embed.data.copy_(embed)
|
149 |
-
self.embed_avg.data.copy_(embed.clone())
|
150 |
-
self.cluster_size.data.copy_(cluster_size)
|
151 |
-
self.inited.data.copy_(torch.Tensor([True]))
|
152 |
-
# Make sure all buffers across workers are in sync after initialization
|
153 |
-
distrib.broadcast_tensors(self.buffers())
|
154 |
-
|
155 |
-
def replace_(self, samples, mask):
|
156 |
-
modified_codebook = torch.where(
|
157 |
-
mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
|
158 |
-
)
|
159 |
-
self.embed.data.copy_(modified_codebook)
|
160 |
-
|
161 |
-
def expire_codes_(self, batch_samples):
|
162 |
-
if self.threshold_ema_dead_code == 0:
|
163 |
-
return
|
164 |
-
|
165 |
-
expired_codes = self.cluster_size < self.threshold_ema_dead_code
|
166 |
-
if not torch.any(expired_codes):
|
167 |
-
return
|
168 |
-
|
169 |
-
batch_samples = rearrange(batch_samples, "... d -> (...) d")
|
170 |
-
self.replace_(batch_samples, mask=expired_codes)
|
171 |
-
# distrib.broadcast_tensors(self.buffers())
|
172 |
-
|
173 |
-
def preprocess(self, x):
|
174 |
-
x = rearrange(x, "... d -> (...) d")
|
175 |
-
return x
|
176 |
-
|
177 |
-
def quantize(self, x):
|
178 |
-
embed = self.embed.t()
|
179 |
-
dist = -(
|
180 |
-
x.pow(2).sum(1, keepdim=True)
|
181 |
-
- 2 * x @ embed
|
182 |
-
+ embed.pow(2).sum(0, keepdim=True)
|
183 |
-
)
|
184 |
-
embed_ind = dist.max(dim=-1).indices
|
185 |
-
return embed_ind
|
186 |
-
|
187 |
-
def postprocess_emb(self, embed_ind, shape):
|
188 |
-
return embed_ind.view(*shape[:-1])
|
189 |
-
|
190 |
-
def dequantize(self, embed_ind):
|
191 |
-
quantize = F.embedding(embed_ind, self.embed)
|
192 |
-
return quantize
|
193 |
-
|
194 |
-
def encode(self, x):
|
195 |
-
shape = x.shape
|
196 |
-
# pre-process
|
197 |
-
x = self.preprocess(x)
|
198 |
-
# quantize
|
199 |
-
embed_ind = self.quantize(x)
|
200 |
-
# post-process
|
201 |
-
embed_ind = self.postprocess_emb(embed_ind, shape)
|
202 |
-
return embed_ind
|
203 |
-
|
204 |
-
def decode(self, embed_ind):
|
205 |
-
quantize = self.dequantize(embed_ind)
|
206 |
-
return quantize
|
207 |
-
|
208 |
-
def forward(self, x):
|
209 |
-
shape, dtype = x.shape, x.dtype
|
210 |
-
x = self.preprocess(x)
|
211 |
-
# self.init_embed_(x)
|
212 |
-
|
213 |
-
embed_ind = self.quantize(x)
|
214 |
-
embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
|
215 |
-
embed_ind = self.postprocess_emb(embed_ind, shape)
|
216 |
-
quantize = self.dequantize(embed_ind)
|
217 |
-
self.runed_steps += 1
|
218 |
-
|
219 |
-
if self.training and self.runed_steps < self.stop_steps:
|
220 |
-
# We do the expiry of code at that point as buffers are in sync
|
221 |
-
# and all the workers will take the same decision.
|
222 |
-
self.expire_codes_(x)
|
223 |
-
ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
|
224 |
-
embed_sum = x.t() @ embed_onehot
|
225 |
-
ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
|
226 |
-
cluster_size = (
|
227 |
-
laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
|
228 |
-
* self.cluster_size.sum()
|
229 |
-
)
|
230 |
-
embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
|
231 |
-
self.embed.data.copy_(embed_normalized)
|
232 |
-
|
233 |
-
return quantize, embed_ind
|
234 |
-
|
235 |
-
|
236 |
-
class VectorQuantization(nn.Module):
|
237 |
-
"""Vector quantization implementation.
|
238 |
-
Currently supports only euclidean distance.
|
239 |
-
Args:
|
240 |
-
dim (int): Dimension
|
241 |
-
codebook_size (int): Codebook size
|
242 |
-
codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
|
243 |
-
decay (float): Decay for exponential moving average over the codebooks.
|
244 |
-
epsilon (float): Epsilon value for numerical stability.
|
245 |
-
kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
|
246 |
-
kmeans_iters (int): Number of iterations used for kmeans initialization.
|
247 |
-
threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
|
248 |
-
that have an exponential moving average cluster size less than the specified threshold with
|
249 |
-
randomly selected vector from the current batch.
|
250 |
-
commitment_weight (float): Weight for commitment loss.
|
251 |
-
"""
|
252 |
-
def __init__(
|
253 |
-
self,
|
254 |
-
dim: int,
|
255 |
-
codebook_size: int,
|
256 |
-
codebook_dim: tp.Optional[int] = None,
|
257 |
-
decay: float = 0.99,
|
258 |
-
epsilon: float = 1e-5,
|
259 |
-
kmeans_init: bool = True,
|
260 |
-
kmeans_iters: int = 50,
|
261 |
-
threshold_ema_dead_code: int = 2,
|
262 |
-
commitment_weight: float = 1.,
|
263 |
-
):
|
264 |
-
super().__init__()
|
265 |
-
_codebook_dim: int = default(codebook_dim, dim)
|
266 |
-
|
267 |
-
requires_projection = _codebook_dim != dim
|
268 |
-
self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
|
269 |
-
self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
|
270 |
-
|
271 |
-
self.epsilon = epsilon
|
272 |
-
self.commitment_weight = commitment_weight
|
273 |
-
|
274 |
-
self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
|
275 |
-
kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
|
276 |
-
decay=decay, epsilon=epsilon,
|
277 |
-
threshold_ema_dead_code=threshold_ema_dead_code)
|
278 |
-
self.codebook_size = codebook_size
|
279 |
-
|
280 |
-
@property
|
281 |
-
def codebook(self):
|
282 |
-
return self._codebook.embed
|
283 |
-
|
284 |
-
def encode(self, x):
|
285 |
-
x = rearrange(x, "b d n -> b n d")
|
286 |
-
x = self.project_in(x)
|
287 |
-
embed_in = self._codebook.encode(x)
|
288 |
-
return embed_in
|
289 |
-
|
290 |
-
def decode(self, embed_ind):
|
291 |
-
quantize = self._codebook.decode(embed_ind)
|
292 |
-
quantize = self.project_out(quantize)
|
293 |
-
quantize = rearrange(quantize, "b n d -> b d n")
|
294 |
-
return quantize
|
295 |
-
|
296 |
-
def forward(self, x, do_debug=False):
|
297 |
-
device = x.device
|
298 |
-
x = rearrange(x, "b d n -> b n d")
|
299 |
-
x = self.project_in(x)
|
300 |
-
|
301 |
-
quantize, embed_ind = self._codebook(x)
|
302 |
-
|
303 |
-
if self.training:
|
304 |
-
quantize = x + (quantize - x).detach()
|
305 |
-
|
306 |
-
loss = torch.tensor([0.0], device=device, requires_grad=self.training)
|
307 |
-
|
308 |
-
if self.training:
|
309 |
-
if self.commitment_weight > 0:
|
310 |
-
commit_loss = F.mse_loss(quantize.detach(), x)
|
311 |
-
loss = loss + commit_loss * self.commitment_weight
|
312 |
-
quantize = self.project_out(quantize)
|
313 |
-
quantize = rearrange(quantize, "b n d -> b d n")
|
314 |
-
return quantize, embed_ind, loss
|
315 |
-
|
316 |
-
|
317 |
-
class ResidualVectorQuantization(nn.Module):
|
318 |
-
"""Residual vector quantization implementation.
|
319 |
-
Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
|
320 |
-
"""
|
321 |
-
def __init__(self, *, num_quantizers, **kwargs):
|
322 |
-
super().__init__()
|
323 |
-
self.layers = nn.ModuleList(
|
324 |
-
[VectorQuantization(**kwargs) for _ in range(num_quantizers)]
|
325 |
-
)
|
326 |
-
|
327 |
-
def forward(self, x, n_q: tp.Optional[int] = None):
|
328 |
-
quantized_out = 0.0
|
329 |
-
residual = x
|
330 |
-
|
331 |
-
all_losses = []
|
332 |
-
all_indices = []
|
333 |
-
|
334 |
-
n_q = n_q or len(self.layers)
|
335 |
-
|
336 |
-
for layerinx, layer in enumerate(self.layers[:n_q]):
|
337 |
-
print("Layer {} Used ratio {:.1f}".format(layerinx, (layer._codebook.cluster_size > 1.0).sum() / layer._codebook.cluster_size.shape[0] * 100.))
|
338 |
-
quantized, indices, loss = layer(residual)
|
339 |
-
residual = residual - quantized
|
340 |
-
quantized_out = quantized_out + quantized
|
341 |
-
|
342 |
-
all_indices.append(indices)
|
343 |
-
all_losses.append(loss)
|
344 |
-
|
345 |
-
out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
|
346 |
-
return quantized_out, out_indices, out_losses
|
347 |
-
|
348 |
-
def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
|
349 |
-
residual = x
|
350 |
-
all_indices = []
|
351 |
-
n_q = n_q or len(self.layers)
|
352 |
-
for layer in self.layers[:n_q]:
|
353 |
-
indices = layer.encode(residual)
|
354 |
-
quantized = layer.decode(indices)
|
355 |
-
residual = residual - quantized
|
356 |
-
all_indices.append(indices)
|
357 |
-
out_indices = torch.stack(all_indices)
|
358 |
-
return out_indices
|
359 |
-
|
360 |
-
def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
|
361 |
-
quantized_out = torch.tensor(0.0, device=q_indices.device)
|
362 |
-
for i, indices in enumerate(q_indices):
|
363 |
-
layer = self.layers[i]
|
364 |
-
quantized = layer.decode(indices)
|
365 |
-
quantized_out = quantized_out + quantized
|
366 |
-
return quantized_out
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
#
|
7 |
+
# This implementation is inspired from
|
8 |
+
# https://github.com/lucidrains/vector-quantize-pytorch
|
9 |
+
# which is released under MIT License. Hereafter, the original license:
|
10 |
+
# MIT License
|
11 |
+
#
|
12 |
+
# Copyright (c) 2020 Phil Wang
|
13 |
+
#
|
14 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
15 |
+
# of this software and associated documentation files (the "Software"), to deal
|
16 |
+
# in the Software without restriction, including without limitation the rights
|
17 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
18 |
+
# copies of the Software, and to permit persons to whom the Software is
|
19 |
+
# furnished to do so, subject to the following conditions:
|
20 |
+
#
|
21 |
+
# The above copyright notice and this permission notice shall be included in all
|
22 |
+
# copies or substantial portions of the Software.
|
23 |
+
#
|
24 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
25 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
26 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
27 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
28 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
29 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
30 |
+
# SOFTWARE.
|
31 |
+
|
32 |
+
"""Core vector quantization implementation."""
|
33 |
+
|
34 |
+
import typing as tp
|
35 |
+
|
36 |
+
from einops import rearrange, repeat
|
37 |
+
import torch
|
38 |
+
from torch import nn
|
39 |
+
import torch.nn.functional as F
|
40 |
+
|
41 |
+
# from .. import distrib
|
42 |
+
|
43 |
+
|
44 |
+
def default(val: tp.Any, d: tp.Any) -> tp.Any:
|
45 |
+
return val if val is not None else d
|
46 |
+
|
47 |
+
|
48 |
+
def ema_inplace(moving_avg, new, decay: float):
|
49 |
+
moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
|
50 |
+
|
51 |
+
|
52 |
+
def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
|
53 |
+
return (x + epsilon) / (x.sum() + n_categories * epsilon)
|
54 |
+
|
55 |
+
|
56 |
+
def uniform_init(*shape: int):
|
57 |
+
t = torch.empty(shape)
|
58 |
+
nn.init.kaiming_uniform_(t)
|
59 |
+
return t
|
60 |
+
|
61 |
+
|
62 |
+
def sample_vectors(samples, num: int):
|
63 |
+
num_samples, device = samples.shape[0], samples.device
|
64 |
+
|
65 |
+
if num_samples >= num:
|
66 |
+
indices = torch.randperm(num_samples, device=device)[:num]
|
67 |
+
else:
|
68 |
+
indices = torch.randint(0, num_samples, (num,), device=device)
|
69 |
+
|
70 |
+
return samples[indices]
|
71 |
+
|
72 |
+
|
73 |
+
def kmeans(samples, num_clusters: int, num_iters: int = 10):
|
74 |
+
dim, dtype = samples.shape[-1], samples.dtype
|
75 |
+
|
76 |
+
means = sample_vectors(samples, num_clusters)
|
77 |
+
|
78 |
+
for _ in range(num_iters):
|
79 |
+
diffs = rearrange(samples, "n d -> n () d") - rearrange(
|
80 |
+
means, "c d -> () c d"
|
81 |
+
)
|
82 |
+
dists = -(diffs ** 2).sum(dim=-1)
|
83 |
+
|
84 |
+
buckets = dists.max(dim=-1).indices
|
85 |
+
bins = torch.bincount(buckets, minlength=num_clusters)
|
86 |
+
zero_mask = bins == 0
|
87 |
+
bins_min_clamped = bins.masked_fill(zero_mask, 1)
|
88 |
+
|
89 |
+
new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
|
90 |
+
new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
|
91 |
+
new_means = new_means / bins_min_clamped[..., None]
|
92 |
+
|
93 |
+
means = torch.where(zero_mask[..., None], means, new_means)
|
94 |
+
|
95 |
+
return means, bins
|
96 |
+
|
97 |
+
|
98 |
+
class EuclideanCodebook(nn.Module):
|
99 |
+
"""Codebook with Euclidean distance.
|
100 |
+
Args:
|
101 |
+
dim (int): Dimension.
|
102 |
+
codebook_size (int): Codebook size.
|
103 |
+
kmeans_init (bool): Whether to use k-means to initialize the codebooks.
|
104 |
+
If set to true, run the k-means algorithm on the first training batch and use
|
105 |
+
the learned centroids as initialization.
|
106 |
+
kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
|
107 |
+
decay (float): Decay for exponential moving average over the codebooks.
|
108 |
+
epsilon (float): Epsilon value for numerical stability.
|
109 |
+
threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
|
110 |
+
that have an exponential moving average cluster size less than the specified threshold with
|
111 |
+
randomly selected vector from the current batch.
|
112 |
+
"""
|
113 |
+
def __init__(
|
114 |
+
self,
|
115 |
+
dim: int,
|
116 |
+
codebook_size: int,
|
117 |
+
kmeans_init: int = False,
|
118 |
+
kmeans_iters: int = 10,
|
119 |
+
decay: float = 0.99,
|
120 |
+
epsilon: float = 1e-5,
|
121 |
+
threshold_ema_dead_code: int = 2,
|
122 |
+
):
|
123 |
+
super().__init__()
|
124 |
+
self.decay = decay
|
125 |
+
init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
|
126 |
+
embed = init_fn(codebook_size, dim)
|
127 |
+
|
128 |
+
self.codebook_size = codebook_size
|
129 |
+
|
130 |
+
self.kmeans_iters = kmeans_iters
|
131 |
+
self.epsilon = epsilon
|
132 |
+
self.threshold_ema_dead_code = threshold_ema_dead_code
|
133 |
+
|
134 |
+
self.register_buffer("inited", torch.Tensor([not kmeans_init]))
|
135 |
+
self.register_buffer("cluster_size", torch.zeros(codebook_size))
|
136 |
+
self.register_buffer("embed", embed)
|
137 |
+
self.register_buffer("embed_avg", embed.clone())
|
138 |
+
|
139 |
+
self.runed_steps = 0
|
140 |
+
self.stop_steps = 50_000
|
141 |
+
|
142 |
+
@torch.jit.ignore
|
143 |
+
def init_embed_(self, data):
|
144 |
+
if self.inited:
|
145 |
+
return
|
146 |
+
|
147 |
+
embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
|
148 |
+
self.embed.data.copy_(embed)
|
149 |
+
self.embed_avg.data.copy_(embed.clone())
|
150 |
+
self.cluster_size.data.copy_(cluster_size)
|
151 |
+
self.inited.data.copy_(torch.Tensor([True]))
|
152 |
+
# Make sure all buffers across workers are in sync after initialization
|
153 |
+
distrib.broadcast_tensors(self.buffers())
|
154 |
+
|
155 |
+
def replace_(self, samples, mask):
|
156 |
+
modified_codebook = torch.where(
|
157 |
+
mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
|
158 |
+
)
|
159 |
+
self.embed.data.copy_(modified_codebook)
|
160 |
+
|
161 |
+
def expire_codes_(self, batch_samples):
|
162 |
+
if self.threshold_ema_dead_code == 0:
|
163 |
+
return
|
164 |
+
|
165 |
+
expired_codes = self.cluster_size < self.threshold_ema_dead_code
|
166 |
+
if not torch.any(expired_codes):
|
167 |
+
return
|
168 |
+
|
169 |
+
batch_samples = rearrange(batch_samples, "... d -> (...) d")
|
170 |
+
self.replace_(batch_samples, mask=expired_codes)
|
171 |
+
# distrib.broadcast_tensors(self.buffers())
|
172 |
+
|
173 |
+
def preprocess(self, x):
|
174 |
+
x = rearrange(x, "... d -> (...) d")
|
175 |
+
return x
|
176 |
+
|
177 |
+
def quantize(self, x):
|
178 |
+
embed = self.embed.t()
|
179 |
+
dist = -(
|
180 |
+
x.pow(2).sum(1, keepdim=True)
|
181 |
+
- 2 * x @ embed
|
182 |
+
+ embed.pow(2).sum(0, keepdim=True)
|
183 |
+
)
|
184 |
+
embed_ind = dist.max(dim=-1).indices
|
185 |
+
return embed_ind
|
186 |
+
|
187 |
+
def postprocess_emb(self, embed_ind, shape):
|
188 |
+
return embed_ind.view(*shape[:-1])
|
189 |
+
|
190 |
+
def dequantize(self, embed_ind):
|
191 |
+
quantize = F.embedding(embed_ind, self.embed)
|
192 |
+
return quantize
|
193 |
+
|
194 |
+
def encode(self, x):
|
195 |
+
shape = x.shape
|
196 |
+
# pre-process
|
197 |
+
x = self.preprocess(x)
|
198 |
+
# quantize
|
199 |
+
embed_ind = self.quantize(x)
|
200 |
+
# post-process
|
201 |
+
embed_ind = self.postprocess_emb(embed_ind, shape)
|
202 |
+
return embed_ind
|
203 |
+
|
204 |
+
def decode(self, embed_ind):
|
205 |
+
quantize = self.dequantize(embed_ind)
|
206 |
+
return quantize
|
207 |
+
|
208 |
+
def forward(self, x):
|
209 |
+
shape, dtype = x.shape, x.dtype
|
210 |
+
x = self.preprocess(x)
|
211 |
+
# self.init_embed_(x)
|
212 |
+
|
213 |
+
embed_ind = self.quantize(x)
|
214 |
+
embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
|
215 |
+
embed_ind = self.postprocess_emb(embed_ind, shape)
|
216 |
+
quantize = self.dequantize(embed_ind)
|
217 |
+
self.runed_steps += 1
|
218 |
+
|
219 |
+
if self.training and self.runed_steps < self.stop_steps:
|
220 |
+
# We do the expiry of code at that point as buffers are in sync
|
221 |
+
# and all the workers will take the same decision.
|
222 |
+
self.expire_codes_(x)
|
223 |
+
ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
|
224 |
+
embed_sum = x.t() @ embed_onehot
|
225 |
+
ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
|
226 |
+
cluster_size = (
|
227 |
+
laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
|
228 |
+
* self.cluster_size.sum()
|
229 |
+
)
|
230 |
+
embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
|
231 |
+
self.embed.data.copy_(embed_normalized)
|
232 |
+
|
233 |
+
return quantize, embed_ind
|
234 |
+
|
235 |
+
|
236 |
+
class VectorQuantization(nn.Module):
|
237 |
+
"""Vector quantization implementation.
|
238 |
+
Currently supports only euclidean distance.
|
239 |
+
Args:
|
240 |
+
dim (int): Dimension
|
241 |
+
codebook_size (int): Codebook size
|
242 |
+
codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
|
243 |
+
decay (float): Decay for exponential moving average over the codebooks.
|
244 |
+
epsilon (float): Epsilon value for numerical stability.
|
245 |
+
kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
|
246 |
+
kmeans_iters (int): Number of iterations used for kmeans initialization.
|
247 |
+
threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
|
248 |
+
that have an exponential moving average cluster size less than the specified threshold with
|
249 |
+
randomly selected vector from the current batch.
|
250 |
+
commitment_weight (float): Weight for commitment loss.
|
251 |
+
"""
|
252 |
+
def __init__(
|
253 |
+
self,
|
254 |
+
dim: int,
|
255 |
+
codebook_size: int,
|
256 |
+
codebook_dim: tp.Optional[int] = None,
|
257 |
+
decay: float = 0.99,
|
258 |
+
epsilon: float = 1e-5,
|
259 |
+
kmeans_init: bool = True,
|
260 |
+
kmeans_iters: int = 50,
|
261 |
+
threshold_ema_dead_code: int = 2,
|
262 |
+
commitment_weight: float = 1.,
|
263 |
+
):
|
264 |
+
super().__init__()
|
265 |
+
_codebook_dim: int = default(codebook_dim, dim)
|
266 |
+
|
267 |
+
requires_projection = _codebook_dim != dim
|
268 |
+
self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
|
269 |
+
self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
|
270 |
+
|
271 |
+
self.epsilon = epsilon
|
272 |
+
self.commitment_weight = commitment_weight
|
273 |
+
|
274 |
+
self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
|
275 |
+
kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
|
276 |
+
decay=decay, epsilon=epsilon,
|
277 |
+
threshold_ema_dead_code=threshold_ema_dead_code)
|
278 |
+
self.codebook_size = codebook_size
|
279 |
+
|
280 |
+
@property
|
281 |
+
def codebook(self):
|
282 |
+
return self._codebook.embed
|
283 |
+
|
284 |
+
def encode(self, x):
|
285 |
+
x = rearrange(x, "b d n -> b n d")
|
286 |
+
x = self.project_in(x)
|
287 |
+
embed_in = self._codebook.encode(x)
|
288 |
+
return embed_in
|
289 |
+
|
290 |
+
def decode(self, embed_ind):
|
291 |
+
quantize = self._codebook.decode(embed_ind)
|
292 |
+
quantize = self.project_out(quantize)
|
293 |
+
quantize = rearrange(quantize, "b n d -> b d n")
|
294 |
+
return quantize
|
295 |
+
|
296 |
+
def forward(self, x, do_debug=False):
|
297 |
+
device = x.device
|
298 |
+
x = rearrange(x, "b d n -> b n d")
|
299 |
+
x = self.project_in(x)
|
300 |
+
|
301 |
+
quantize, embed_ind = self._codebook(x)
|
302 |
+
|
303 |
+
if self.training:
|
304 |
+
quantize = x + (quantize - x).detach()
|
305 |
+
|
306 |
+
loss = torch.tensor([0.0], device=device, requires_grad=self.training)
|
307 |
+
|
308 |
+
if self.training:
|
309 |
+
if self.commitment_weight > 0:
|
310 |
+
commit_loss = F.mse_loss(quantize.detach(), x)
|
311 |
+
loss = loss + commit_loss * self.commitment_weight
|
312 |
+
quantize = self.project_out(quantize)
|
313 |
+
quantize = rearrange(quantize, "b n d -> b d n")
|
314 |
+
return quantize, embed_ind, loss
|
315 |
+
|
316 |
+
|
317 |
+
class ResidualVectorQuantization(nn.Module):
|
318 |
+
"""Residual vector quantization implementation.
|
319 |
+
Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
|
320 |
+
"""
|
321 |
+
def __init__(self, *, num_quantizers, **kwargs):
|
322 |
+
super().__init__()
|
323 |
+
self.layers = nn.ModuleList(
|
324 |
+
[VectorQuantization(**kwargs) for _ in range(num_quantizers)]
|
325 |
+
)
|
326 |
+
|
327 |
+
def forward(self, x, n_q: tp.Optional[int] = None):
|
328 |
+
quantized_out = 0.0
|
329 |
+
residual = x
|
330 |
+
|
331 |
+
all_losses = []
|
332 |
+
all_indices = []
|
333 |
+
|
334 |
+
n_q = n_q or len(self.layers)
|
335 |
+
|
336 |
+
for layerinx, layer in enumerate(self.layers[:n_q]):
|
337 |
+
print("Layer {} Used ratio {:.1f}".format(layerinx, (layer._codebook.cluster_size > 1.0).sum() / layer._codebook.cluster_size.shape[0] * 100.))
|
338 |
+
quantized, indices, loss = layer(residual)
|
339 |
+
residual = residual - quantized
|
340 |
+
quantized_out = quantized_out + quantized
|
341 |
+
|
342 |
+
all_indices.append(indices)
|
343 |
+
all_losses.append(loss)
|
344 |
+
|
345 |
+
out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
|
346 |
+
return quantized_out, out_indices, out_losses
|
347 |
+
|
348 |
+
def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
|
349 |
+
residual = x
|
350 |
+
all_indices = []
|
351 |
+
n_q = n_q or len(self.layers)
|
352 |
+
for layer in self.layers[:n_q]:
|
353 |
+
indices = layer.encode(residual)
|
354 |
+
quantized = layer.decode(indices)
|
355 |
+
residual = residual - quantized
|
356 |
+
all_indices.append(indices)
|
357 |
+
out_indices = torch.stack(all_indices)
|
358 |
+
return out_indices
|
359 |
+
|
360 |
+
def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
|
361 |
+
quantized_out = torch.tensor(0.0, device=q_indices.device)
|
362 |
+
for i, indices in enumerate(q_indices):
|
363 |
+
layer = self.layers[i]
|
364 |
+
quantized = layer.decode(indices)
|
365 |
+
quantized_out = quantized_out + quantized
|
366 |
+
return quantized_out
|
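For orientation, the module above implements residual vector quantization: each `VectorQuantization` layer quantizes the residual left by the previous layer, and the per-layer codes are summed back together at decode time. A minimal encode/decode round trip, with toy shapes and hyperparameters (not the values used by SongGeneration; the import path is assumed relative to the Flow1dVAE directory):

import torch
from libs.rvq.core_vq import ResidualVectorQuantization  # assumed import path

rvq = ResidualVectorQuantization(
    num_quantizers=4,     # four residual codebooks
    dim=128,              # input feature dimension
    codebook_size=1024,   # entries per codebook
    kmeans_init=False,    # keep the toy example free of k-means warm-up
)
rvq.eval()

x = torch.randn(2, 128, 50)        # (batch, dim, time)
with torch.no_grad():
    codes = rvq.encode(x)          # (num_quantizers, batch, time) integer indices
    x_hat = rvq.decode(codes)      # (batch, dim, time) reconstruction from summed codebooks
print(codes.shape, x_hat.shape)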
codeclm/tokenizer/Flow1dVAE/model_1rvq.py
CHANGED
@@ -1,710 +1,710 @@
|
|
1 |
-
import yaml
|
2 |
-
import random
|
3 |
-
import inspect
|
4 |
-
import numpy as np
|
5 |
-
from tqdm import tqdm
|
6 |
-
import typing as tp
|
7 |
-
from abc import ABC
|
8 |
-
|
9 |
-
import torch
|
10 |
-
import torch.nn as nn
|
11 |
-
import torch.nn.functional as F
|
12 |
-
import torchaudio
|
13 |
-
|
14 |
-
from tools.torch_tools import wav_to_fbank
|
15 |
-
|
16 |
-
from diffusers.utils.torch_utils import randn_tensor
|
17 |
-
from transformers import HubertModel
|
18 |
-
from libs.rvq.descript_quantize3 import ResidualVectorQuantize
|
19 |
-
|
20 |
-
from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
|
21 |
-
from models_gpt.models.gpt2_config import GPT2Config
|
22 |
-
|
23 |
-
from torch.cuda.amp import autocast
|
24 |
-
|
25 |
-
|
26 |
-
from our_MERT_BESTRQ.test import load_model
|
27 |
-
|
28 |
-
class HubertModelWithFinalProj(HubertModel):
|
29 |
-
def __init__(self, config):
|
30 |
-
super().__init__(config)
|
31 |
-
|
32 |
-
# The final projection layer is only used for backward compatibility.
|
33 |
-
# Following https://github.com/auspicious3000/contentvec/issues/6
|
34 |
-
# Remove this layer is necessary to achieve the desired outcome.
|
35 |
-
print("hidden_size:",config.hidden_size)
|
36 |
-
print("classifier_proj_size:",config.classifier_proj_size)
|
37 |
-
self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
|
38 |
-
|
39 |
-
|
40 |
-
class SampleProcessor(torch.nn.Module):
|
41 |
-
    def project_sample(self, x: torch.Tensor):
        """Project the original sample to the 'space' where the diffusion will happen."""
        return x

    def return_sample(self, z: torch.Tensor):
        """Project back from diffusion space to the actual sample space."""
        return z
|
45 |
-
|
46 |
-
class Feature1DProcessor(SampleProcessor):
|
47 |
-
def __init__(self, dim: int = 100, power_std = 1., \
|
48 |
-
num_samples: int = 100_000, cal_num_frames: int = 600):
|
49 |
-
super().__init__()
|
50 |
-
|
51 |
-
self.num_samples = num_samples
|
52 |
-
self.dim = dim
|
53 |
-
self.power_std = power_std
|
54 |
-
self.cal_num_frames = cal_num_frames
|
55 |
-
self.register_buffer('counts', torch.zeros(1))
|
56 |
-
self.register_buffer('sum_x', torch.zeros(dim))
|
57 |
-
self.register_buffer('sum_x2', torch.zeros(dim))
|
58 |
-
self.register_buffer('sum_target_x2', torch.zeros(dim))
|
59 |
-
self.counts: torch.Tensor
|
60 |
-
self.sum_x: torch.Tensor
|
61 |
-
self.sum_x2: torch.Tensor
|
62 |
-
|
63 |
-
@property
|
64 |
-
def mean(self):
|
65 |
-
mean = self.sum_x / self.counts
|
66 |
-
if(self.counts < 10):
|
67 |
-
mean = torch.zeros_like(mean)
|
68 |
-
return mean
|
69 |
-
|
70 |
-
@property
|
71 |
-
def std(self):
|
72 |
-
std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
|
73 |
-
if(self.counts < 10):
|
74 |
-
std = torch.ones_like(std)
|
75 |
-
return std
|
76 |
-
|
77 |
-
@property
|
78 |
-
def target_std(self):
|
79 |
-
return 1
|
80 |
-
|
81 |
-
def project_sample(self, x: torch.Tensor):
|
82 |
-
assert x.dim() == 3
|
83 |
-
if self.counts.item() < self.num_samples:
|
84 |
-
self.counts += len(x)
|
85 |
-
self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
|
86 |
-
self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
|
87 |
-
rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
|
88 |
-
x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
|
89 |
-
return x
|
90 |
-
|
91 |
-
def return_sample(self, x: torch.Tensor):
|
92 |
-
assert x.dim() == 3
|
93 |
-
rescale = (self.std / self.target_std) ** self.power_std
|
94 |
-
# print(rescale, self.mean)
|
95 |
-
x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
|
96 |
-
return x
|
97 |
-
|
98 |
-
def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
|
99 |
-
if(prior_text_encoder_hidden_states.shape[1]<len_size):
|
100 |
-
prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
|
101 |
-
torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
|
102 |
-
prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
|
103 |
-
dtype=prior_text_encoder_hidden_states.dtype)],1)
|
104 |
-
prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
|
105 |
-
else:
|
106 |
-
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
|
107 |
-
prior_text_mask = prior_text_mask[:,0:len_size]
|
108 |
-
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
|
109 |
-
return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
|
110 |
-
|
111 |
-
class BASECFM(torch.nn.Module, ABC):
|
112 |
-
def __init__(
|
113 |
-
self,
|
114 |
-
estimator,
|
115 |
-
mlp,
|
116 |
-
ssl_layer
|
117 |
-
):
|
118 |
-
super().__init__()
|
119 |
-
self.sigma_min = 1e-4
|
120 |
-
|
121 |
-
self.estimator = estimator
|
122 |
-
self.mlp = mlp
|
123 |
-
self.ssl_layer = ssl_layer
|
124 |
-
|
125 |
-
@torch.inference_mode()
|
126 |
-
def forward(self, mu, n_timesteps, temperature=1.0):
|
127 |
-
"""Forward diffusion
|
128 |
-
|
129 |
-
Args:
|
130 |
-
mu (torch.Tensor): output of encoder
|
131 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
132 |
-
n_timesteps (int): number of diffusion steps
|
133 |
-
temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
|
134 |
-
|
135 |
-
Returns:
|
136 |
-
sample: generated mel-spectrogram
|
137 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
138 |
-
"""
|
139 |
-
z = torch.randn_like(mu) * temperature
|
140 |
-
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
|
141 |
-
return self.solve_euler(z, t_span=t_span)
|
142 |
-
|
143 |
-
def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
|
144 |
-
"""
|
145 |
-
Fixed euler solver for ODEs.
|
146 |
-
Args:
|
147 |
-
x (torch.Tensor): random noise
|
148 |
-
t_span (torch.Tensor): n_timesteps interpolated
|
149 |
-
shape: (n_timesteps + 1,)
|
150 |
-
mu (torch.Tensor): output of encoder
|
151 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
152 |
-
"""
|
153 |
-
t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
|
154 |
-
noise = x.clone()
|
155 |
-
|
156 |
-
# I am storing this because I can later plot it by putting a debugger here and saving it to a file
|
157 |
-
# Or in future might add like a return_all_steps flag
|
158 |
-
sol = []
|
159 |
-
|
160 |
-
for step in tqdm(range(1, len(t_span))):
|
161 |
-
# print("incontext_x.shape:",incontext_x.shape)
|
162 |
-
# print("noise.shape:",noise.shape)
|
163 |
-
# print("t.shape:",t.shape)
|
164 |
-
x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
|
165 |
-
if(guidance_scale > 1.0):
|
166 |
-
|
167 |
-
model_input = torch.cat([ \
|
168 |
-
torch.cat([latent_mask_input, latent_mask_input], 0), \
|
169 |
-
torch.cat([incontext_x, incontext_x], 0), \
|
170 |
-
torch.cat([torch.zeros_like(mu), mu], 0), \
|
171 |
-
torch.cat([x, x], 0), \
|
172 |
-
], 2)
|
173 |
-
timestep=t.unsqueeze(-1).repeat(2)
|
174 |
-
|
175 |
-
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
176 |
-
dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
|
177 |
-
dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
|
178 |
-
else:
|
179 |
-
model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
|
180 |
-
timestep=t.unsqueeze(-1)
|
181 |
-
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
182 |
-
|
183 |
-
dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
|
184 |
-
# print("dphi_dt.shape:",dphi_dt.shape)
|
185 |
-
# print("x.shape:",x.shape)
|
186 |
-
|
187 |
-
x = x + dt * dphi_dt
|
188 |
-
t = t + dt
|
189 |
-
sol.append(x)
|
190 |
-
if step < len(t_span) - 1:
|
191 |
-
dt = t_span[step + 1] - t
|
192 |
-
|
193 |
-
return sol[-1]
|
194 |
-
|
195 |
-
def projection_loss(self,hidden_proj, bestrq_emb):
|
196 |
-
bsz = hidden_proj.shape[0]
|
197 |
-
|
198 |
-
hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
|
199 |
-
bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
|
200 |
-
|
201 |
-
proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
|
202 |
-
proj_loss = 1+proj_loss.mean()
|
203 |
-
|
204 |
-
return proj_loss
|
205 |
-
|
206 |
-
def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
|
207 |
-
"""Computes diffusion loss
|
208 |
-
|
209 |
-
Args:
|
210 |
-
x1 (torch.Tensor): Target
|
211 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
212 |
-
mu (torch.Tensor): output of encoder
|
213 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
214 |
-
|
215 |
-
Returns:
|
216 |
-
loss: conditional flow matching loss
|
217 |
-
y: conditional flow
|
218 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
219 |
-
"""
|
220 |
-
b = mu[0].shape[0]
|
221 |
-
len_x = x1.shape[2]
|
222 |
-
# random timestep
|
223 |
-
if(validation_mode):
|
224 |
-
t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
|
225 |
-
else:
|
226 |
-
t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
|
227 |
-
# sample noise p(x_0)
|
228 |
-
z = torch.randn_like(x1)
|
229 |
-
|
230 |
-
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
231 |
-
u = x1 - (1 - self.sigma_min) * z
|
232 |
-
# print("y.shape:",y.shape)
|
233 |
-
#self.unet(inputs_embeds=model_input, attention_mask=attention_mask,encoder_hidden_states=text_embedding,encoder_attention_mask=txt_attn_mask,time_step=timesteps).last_hidden_state
|
234 |
-
model_input = torch.cat([*mu,y], 2)
|
235 |
-
t=t.squeeze(-1).squeeze(-1)
|
236 |
-
# print("model_input.shape:",model_input.shape)
|
237 |
-
# print("attention_mask.shape:",attention_mask.shape)
|
238 |
-
out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
|
239 |
-
hidden_layer = out.hidden_states[self.ssl_layer]
|
240 |
-
hidden_proj = self.mlp(hidden_layer)
|
241 |
-
# print("hidden_proj.shape:",hidden_proj.shape)
|
242 |
-
# print("mert_emb.shape:",mert_emb.shape)
|
243 |
-
# exit()
|
244 |
-
|
245 |
-
|
246 |
-
out = out.last_hidden_state
|
247 |
-
|
248 |
-
out=out[:,:,-len_x:]
|
249 |
-
# out=self.proj_out(out)
|
250 |
-
|
251 |
-
weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
|
252 |
-
# print("out.shape",out.shape)
|
253 |
-
# print("u.shape",u.shape)
|
254 |
-
loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
|
255 |
-
# print("hidden_proj.shape:",hidden_proj.shape)
|
256 |
-
# print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
|
257 |
-
loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
|
258 |
-
loss = loss_re + loss_cos * 0.5
|
259 |
-
# print("loss_cos:",loss_cos,loss_cos.device)
|
260 |
-
print("loss:",loss,loss.device)
|
261 |
-
# exit()
|
262 |
-
return loss, loss_re, loss_cos
|
263 |
-
|
264 |
-
class PromptCondAudioDiffusion(nn.Module):
|
265 |
-
def __init__(
|
266 |
-
self,
|
267 |
-
num_channels,
|
268 |
-
unet_model_name=None,
|
269 |
-
unet_model_config_path=None,
|
270 |
-
snr_gamma=None,
|
271 |
-
hubert_layer=None,
|
272 |
-
ssl_layer=None,
|
273 |
-
uncondition=True,
|
274 |
-
out_paint=False,
|
275 |
-
):
|
276 |
-
super().__init__()
|
277 |
-
|
278 |
-
assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
|
279 |
-
|
280 |
-
self.unet_model_name = unet_model_name
|
281 |
-
self.unet_model_config_path = unet_model_config_path
|
282 |
-
self.snr_gamma = snr_gamma
|
283 |
-
self.uncondition = uncondition
|
284 |
-
self.num_channels = num_channels
|
285 |
-
self.hubert_layer = hubert_layer
|
286 |
-
self.ssl_layer = ssl_layer
|
287 |
-
|
288 |
-
# https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
|
289 |
-
self.normfeat = Feature1DProcessor(dim=64)
|
290 |
-
|
291 |
-
self.sample_rate = 48000
|
292 |
-
self.num_samples_perseg = self.sample_rate * 20 // 1000
|
293 |
-
self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
|
294 |
-
self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
|
295 |
-
# self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
296 |
-
# self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
297 |
-
self.bestrq = load_model(
|
298 |
-
model_dir='codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq',
|
299 |
-
checkpoint_dir='ckpt/encode-s12k.pt',
|
300 |
-
)
|
301 |
-
self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
|
302 |
-
self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
|
303 |
-
for v in self.bestrq.parameters():v.requires_grad = False
|
304 |
-
self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
305 |
-
for v in self.rvq_bestrq_emb.parameters():v.requires_grad = False
|
306 |
-
self.hubert = HubertModelWithFinalProj.from_pretrained("ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
|
307 |
-
for v in self.hubert.parameters():v.requires_grad = False
|
308 |
-
self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
|
309 |
-
# self.xvecmodel = XVECModel()
|
310 |
-
config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
|
311 |
-
unet = GPT2Model(config)
|
312 |
-
mlp = nn.Sequential(
|
313 |
-
nn.Linear(1200, 1024),
|
314 |
-
nn.SiLU(),
|
315 |
-
nn.Linear(1024, 1024),
|
316 |
-
nn.SiLU(),
|
317 |
-
nn.Linear(1024, 768)
|
318 |
-
)
|
319 |
-
self.set_from = "random"
|
320 |
-
self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
|
321 |
-
self.mask_emb = torch.nn.Embedding(3, 48)
|
322 |
-
print("Transformer initialized from pretrain.")
|
323 |
-
torch.cuda.empty_cache()
|
324 |
-
# self.unet.set_attn_processor(AttnProcessor2_0())
|
325 |
-
# self.unet.set_use_memory_efficient_attention_xformers(True)
|
326 |
-
|
327 |
-
# self.start_embedding = nn.Parameter(torch.randn(1,1024))
|
328 |
-
# self.end_embedding = nn.Parameter(torch.randn(1,1024))
|
329 |
-
|
330 |
-
def compute_snr(self, timesteps):
|
331 |
-
"""
|
332 |
-
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
|
333 |
-
"""
|
334 |
-
alphas_cumprod = self.noise_scheduler.alphas_cumprod
|
335 |
-
sqrt_alphas_cumprod = alphas_cumprod**0.5
|
336 |
-
sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
|
337 |
-
|
338 |
-
# Expand the tensors.
|
339 |
-
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
|
340 |
-
sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
341 |
-
while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
|
342 |
-
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
|
343 |
-
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
|
344 |
-
|
345 |
-
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
346 |
-
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
|
347 |
-
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
|
348 |
-
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
|
349 |
-
|
350 |
-
# Compute SNR.
|
351 |
-
snr = (alpha / sigma) ** 2
|
352 |
-
return snr
|
353 |
-
|
354 |
-
def preprocess_audio(self, input_audios, threshold=0.9):
|
355 |
-
assert len(input_audios.shape) == 2, input_audios.shape
|
356 |
-
norm_value = torch.ones_like(input_audios[:,0])
|
357 |
-
max_volume = input_audios.abs().max(dim=-1)[0]
|
358 |
-
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
359 |
-
return input_audios/norm_value.unsqueeze(-1)
|
360 |
-
|
361 |
-
def extract_wav2vec_embeds(self, input_audios,output_len):
|
362 |
-
wav2vec_stride = 2
|
363 |
-
|
364 |
-
wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
|
365 |
-
# print(wav2vec_embeds)
|
366 |
-
# print("audio.shape:",input_audios.shape)
|
367 |
-
wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
|
368 |
-
# print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
|
369 |
-
wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
|
370 |
-
return wav2vec_embeds_last
|
371 |
-
|
372 |
-
def extract_mert_embeds(self, input_audios):
|
373 |
-
prompt_stride = 3
|
374 |
-
inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
|
375 |
-
input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
|
376 |
-
prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
|
377 |
-
mert_emb= prompt_embeds[-1]
|
378 |
-
mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)
|
379 |
-
|
380 |
-
return mert_emb
|
381 |
-
|
382 |
-
def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
|
383 |
-
self.bestrq.eval()
|
384 |
-
# print("audio shape:",input_audio_0.shape)
|
385 |
-
input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
|
386 |
-
# print("input_wav_mean.shape:",input_wav_mean.shape)
|
387 |
-
# input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
|
388 |
-
input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
|
389 |
-
layer_results = input_wav_mean['layer_results']
|
390 |
-
# print("layer_results.shape:",layer_results[layer].shape)
|
391 |
-
bestrq_emb = layer_results[layer]
|
392 |
-
bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
|
393 |
-
#[b,t,1024] t=t/960
|
394 |
-
#35.84s->batch,896,1024
|
395 |
-
return bestrq_emb
|
396 |
-
|
397 |
-
|
398 |
-
def extract_spk_embeds(self, input_audios):
|
399 |
-
spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
|
400 |
-
spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
|
401 |
-
return spk_embeds
|
402 |
-
|
403 |
-
def extract_lyric_feats(self, lyric):
|
404 |
-
with torch.no_grad():
|
405 |
-
try:
|
406 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
|
407 |
-
except:
|
408 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
|
409 |
-
text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
|
410 |
-
text_mask = text_mask.to(self.device)
|
411 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = \
|
412 |
-
pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
|
413 |
-
text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
|
414 |
-
return text_encoder_hidden_states, text_mask
|
415 |
-
|
416 |
-
def extract_energy_bar(self, input_audios):
|
417 |
-
if(input_audios.shape[-1] % self.num_samples_perseg > 0):
|
418 |
-
energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
419 |
-
else:
|
420 |
-
energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
421 |
-
energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
|
422 |
-
energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
|
423 |
-
energy_embedding = self.energy_embedding(energy_bar)
|
424 |
-
energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
|
425 |
-
return energy_embedding
|
426 |
-
|
427 |
-
def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
|
428 |
-
additional_feats = ['spk', 'lyric'], \
|
429 |
-
train_rvq=True, train_ssl=False,layer=5):
|
430 |
-
if not hasattr(self,"device"):
|
431 |
-
self.device = input_audios.device
|
432 |
-
if not hasattr(self,"dtype"):
|
433 |
-
self.dtype = input_audios.dtype
|
434 |
-
device = self.device
|
435 |
-
input_audio_0 = input_audios[:,0,:]
|
436 |
-
input_audio_1 = input_audios[:,1,:]
|
437 |
-
input_audio_0 = self.preprocess_audio(input_audio_0)
|
438 |
-
input_audio_1 = self.preprocess_audio(input_audio_1)
|
439 |
-
input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
|
440 |
-
# energy_embedding = self.extract_energy_bar(input_audios)
|
441 |
-
# print("energy_embedding.shape:",energy_embedding.shape)
|
442 |
-
# with autocast(enabled=False):
|
443 |
-
if(train_ssl):
|
444 |
-
self.wav2vec.train()
|
445 |
-
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
|
446 |
-
self.clap_embd_extractor.train()
|
447 |
-
prompt_embeds = self.extract_mert_embeds(input_audios)
|
448 |
-
if('spk' in additional_feats):
|
449 |
-
self.xvecmodel.train()
|
450 |
-
spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
|
451 |
-
else:
|
452 |
-
with torch.no_grad():
|
453 |
-
with autocast(enabled=False):
|
454 |
-
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
455 |
-
# mert_emb = self.extract_mert_embeds(input_audios_mert)
|
456 |
-
|
457 |
-
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])
|
458 |
-
|
459 |
-
bestrq_emb = bestrq_emb.detach()
|
460 |
-
if('lyric' in additional_feats):
|
461 |
-
text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
|
462 |
-
else:
|
463 |
-
text_encoder_hidden_states, text_mask = None, None
|
464 |
-
|
465 |
-
# prompt_embeds_13 = torch.cat([mert_emb_13, energy_embedding], 1)
|
466 |
-
# print("prompt_embes.shape:",prompt_embeds.shape)
|
467 |
-
#prompt_embes.shape: torch.Size([3, 1088, 896])
|
468 |
-
# print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
|
469 |
-
#wav2vec_embeds.shape:torch.Size([3, 1024, 896])
|
470 |
-
if(train_rvq):
|
471 |
-
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
472 |
-
else:
|
473 |
-
bestrq_emb = bestrq_emb.float()
|
474 |
-
self.rvq_bestrq_emb.eval()
|
475 |
-
# with autocast(enabled=False):
|
476 |
-
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
477 |
-
commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
|
478 |
-
codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
|
479 |
-
quantized_bestrq_emb = quantized_bestrq_emb.detach()
|
480 |
-
|
481 |
-
commitment_loss = commitment_loss_bestrq_emb
|
482 |
-
codebook_loss = codebook_loss_bestrq_emb
|
483 |
-
|
484 |
-
|
485 |
-
alpha=1
|
486 |
-
quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
|
487 |
-
|
488 |
-
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
489 |
-
# print("latent_masks.shape:",latent_masks.shape)
|
490 |
-
-        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
-
-
-
-        scenario = np.random.choice(['start_seg', 'other_seg'])
-        if(scenario == 'other_seg'):
-            for binx in range(input_audios.shape[0]):
-                # latent_masks[binx,0:64] = 1
-                latent_masks[binx,0:random.randint(64,128)] = 1
-        quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
-        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
-        # print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
-        # print("latent_masks.shape:",latent_masks.shape)
-        quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
-            + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
-
-
-
-
-        if self.uncondition:
-            mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
-            if len(mask_indices) > 0:
-                quantized_bestrq_emb[mask_indices] = 0
-        # print("latents.shape:",latents.shape)
-        latents = latents.permute(0,2,1).contiguous()
-        latents = self.normfeat.project_sample(latents)
-        latents = latents.permute(0,2,1).contiguous()
-        incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
-        attention_mask=(latent_masks > 0.5)
-        B, L = attention_mask.size()
-        attention_mask = attention_mask.view(B, 1, L)
-        attention_mask = attention_mask * attention_mask.transpose(-1, -2)
-        attention_mask = attention_mask.unsqueeze(1)
-        # print("incontext_latents.shape:",incontext_latents.shape)
-        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
-        latent_mask_input = self.mask_emb(latent_masks)
-        #64+48+64+1024
-        loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
-        return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
-
-    def init_device_dtype(self, device, dtype):
-        self.device = device
-        self.dtype = dtype
-
-    @torch.no_grad()
-    def fetch_codes(self, input_audios, additional_feats,layer):
-        input_audio_0 = input_audios[[0],:]
-        input_audio_1 = input_audios[[1],:]
-        input_audio_0 = self.preprocess_audio(input_audio_0)
-        input_audio_1 = self.preprocess_audio(input_audio_1)
-
-        self.bestrq.eval()
-
-        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
-        # bestrq_middle = bestrq_middle.detach()
-        # bestrq_last = bestrq_last.detach()
-        bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
-        bestrq_emb = bestrq_emb.detach()
-
-        # self.rvq_bestrq_middle.eval()
-        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
-        # self.rvq_bestrq_last.eval()
-        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
-
-        self.rvq_bestrq_emb.eval()
-        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
-
-
-        if('spk' in additional_feats):
-            self.xvecmodel.eval()
-            spk_embeds = self.extract_spk_embeds(input_audios)
-        else:
-            spk_embeds = None
-
-        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
-        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
-        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
-        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
-        # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
-
-
-    @torch.no_grad()
-    def fetch_codes_batch(self, input_audios, additional_feats,layer):
-        input_audio_0 = input_audios[:,0,:]
-        input_audio_1 = input_audios[:,1,:]
-        input_audio_0 = self.preprocess_audio(input_audio_0)
-        input_audio_1 = self.preprocess_audio(input_audio_1)
-
-        self.bestrq.eval()
-
-        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
-        # bestrq_middle = bestrq_middle.detach()
-        # bestrq_last = bestrq_last.detach()
-        bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
-        bestrq_emb = bestrq_emb.detach()
-
-        # self.rvq_bestrq_middle.eval()
-        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
-        # self.rvq_bestrq_last.eval()
-        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
-
-        self.rvq_bestrq_emb.eval()
-        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
-
-
-        if('spk' in additional_feats):
-            self.xvecmodel.eval()
-            spk_embeds = self.extract_spk_embeds(input_audios)
-        else:
-            spk_embeds = None
-
-        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
-        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
-        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
-        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
-
-    @torch.no_grad()
-    def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
-                guidance_scale=2, num_steps=20,
-                disable_progress=True, scenario='start_seg'):
-        classifier_free_guidance = guidance_scale > 1.0
-        device = self.device
-        dtype = self.dtype
-        # codes_bestrq_middle, codes_bestrq_last = codes
-        codes_bestrq_emb = codes[0]
-
-
-        batch_size = codes_bestrq_emb.shape[0]
-
-
-        quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
-        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
-        quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
-        print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
-        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
-
-
-
-
-        if('spk' in additional_feats):
-            spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
-
-        num_frames = quantized_bestrq_emb.shape[1]
-
-        num_channels_latents = self.num_channels
-        shape = (batch_size, num_frames, 64)
-        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
-
-
-
-        latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
-        latent_masks[:,0:latent_length] = 2
-        if(scenario=='other_seg'):
-            latent_masks[:,0:incontext_length] = 1
-
-
-
-        quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
-            + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
-        true_latents = true_latents.permute(0,2,1).contiguous()
-        true_latents = self.normfeat.project_sample(true_latents)
-        true_latents = true_latents.permute(0,2,1).contiguous()
-        incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
-        incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
-
-
-        attention_mask=(latent_masks > 0.5)
-        B, L = attention_mask.size()
-        attention_mask = attention_mask.view(B, 1, L)
-        attention_mask = attention_mask * attention_mask.transpose(-1, -2)
-        attention_mask = attention_mask.unsqueeze(1)
-        latent_mask_input = self.mask_emb(latent_masks)
-
-        if('spk' in additional_feats):
-            # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
-            additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds],1)
-        else:
-            # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
-            additional_model_input = torch.cat([quantized_bestrq_emb],1)
-
-        temperature = 1.0
-        t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
-        latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask, guidance_scale)
-
-        latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
-        latents = latents.permute(0,2,1).contiguous()
-        latents = self.normfeat.return_sample(latents)
-        # latents = latents.permute(0,2,1).contiguous()
-        return latents
-
-    @torch.no_grad()
-    def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
-                  disable_progress=True,layer=5,scenario='start_seg'):
-        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
-
-        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
-            guidance_scale=guidance_scale, num_steps=num_steps, \
-            disable_progress=disable_progress,scenario=scenario)
-        return latents
-
-    @torch.no_grad()
-    def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
-                  disable_progress=True,layer=5,scenario='start_seg'):
-        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
-        import time
-        start = time.time()
-        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
-            guidance_scale=guidance_scale, num_steps=num_steps, \
-            disable_progress=disable_progress,scenario=scenario)
-        return latents,time.time()-start
-
-    def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
-        divisor = 4
-        shape = (batch_size, num_channels_latents, num_frames, 32)
-        if(num_frames%divisor>0):
-            num_frames = round(num_frames/float(divisor))*divisor
-            shape = (batch_size, num_channels_latents, num_frames, 32)
-        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
-        return latents
-
-
+import yaml
+import random
+import inspect
+import numpy as np
+from tqdm import tqdm
+import typing as tp
+from abc import ABC
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+
+from tools.torch_tools import wav_to_fbank
+
+from diffusers.utils.torch_utils import randn_tensor
+from transformers import HubertModel
+from libs.rvq.descript_quantize3 import ResidualVectorQuantize
+
+from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
+from models_gpt.models.gpt2_config import GPT2Config
+
+from torch.cuda.amp import autocast
+
+
+from our_MERT_BESTRQ.test import load_model
+
+class HubertModelWithFinalProj(HubertModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        # The final projection layer is only used for backward compatibility.
+        # Following https://github.com/auspicious3000/contentvec/issues/6
+        # Remove this layer is necessary to achieve the desired outcome.
+        print("hidden_size:",config.hidden_size)
+        print("classifier_proj_size:",config.classifier_proj_size)
+        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+class SampleProcessor(torch.nn.Module):
+    def project_sample(self, x: torch.Tensor):
+        """Project the original sample to the 'space' where the diffusion will happen."""
+        """Project back from diffusion space to the actual sample space."""
+        return z
+
+class Feature1DProcessor(SampleProcessor):
+    def __init__(self, dim: int = 100, power_std = 1., \
+                 num_samples: int = 100_000, cal_num_frames: int = 600):
+        super().__init__()
+
+        self.num_samples = num_samples
+        self.dim = dim
+        self.power_std = power_std
+        self.cal_num_frames = cal_num_frames
+        self.register_buffer('counts', torch.zeros(1))
+        self.register_buffer('sum_x', torch.zeros(dim))
+        self.register_buffer('sum_x2', torch.zeros(dim))
+        self.register_buffer('sum_target_x2', torch.zeros(dim))
+        self.counts: torch.Tensor
+        self.sum_x: torch.Tensor
+        self.sum_x2: torch.Tensor
+
+    @property
+    def mean(self):
+        mean = self.sum_x / self.counts
+        if(self.counts < 10):
+            mean = torch.zeros_like(mean)
+        return mean
+
+    @property
+    def std(self):
+        std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
+        if(self.counts < 10):
+            std = torch.ones_like(std)
+        return std
+
+    @property
+    def target_std(self):
+        return 1
+
+    def project_sample(self, x: torch.Tensor):
+        assert x.dim() == 3
+        if self.counts.item() < self.num_samples:
+            self.counts += len(x)
+            self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
+            self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
+        rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
+        x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
+        return x
+
+    def return_sample(self, x: torch.Tensor):
+        assert x.dim() == 3
+        rescale = (self.std / self.target_std) ** self.power_std
+        # print(rescale, self.mean)
+        x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
+        return x
+
+def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
+    if(prior_text_encoder_hidden_states.shape[1]<len_size):
+        prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
+            torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
+            prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
+            dtype=prior_text_encoder_hidden_states.dtype)],1)
+        prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
+    else:
+        prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
+        prior_text_mask = prior_text_mask[:,0:len_size]
+    prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
+    return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
+
+class BASECFM(torch.nn.Module, ABC):
+    def __init__(
+        self,
+        estimator,
+        mlp,
+        ssl_layer
+    ):
+        super().__init__()
+        self.sigma_min = 1e-4
+
+        self.estimator = estimator
+        self.mlp = mlp
+        self.ssl_layer = ssl_layer
+
+    @torch.inference_mode()
+    def forward(self, mu, n_timesteps, temperature=1.0):
+        """Forward diffusion
+
+        Args:
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_channels, mel_timesteps, n_feats)
+            n_timesteps (int): number of diffusion steps
+            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
+
+        Returns:
+            sample: generated mel-spectrogram
+                shape: (batch_size, n_channels, mel_timesteps, n_feats)
+        """
+        z = torch.randn_like(mu) * temperature
+        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
+        return self.solve_euler(z, t_span=t_span)
+
+    def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
+        """
+        Fixed euler solver for ODEs.
+        Args:
+            x (torch.Tensor): random noise
+            t_span (torch.Tensor): n_timesteps interpolated
+                shape: (n_timesteps + 1,)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_channels, mel_timesteps, n_feats)
+        """
+        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
+        noise = x.clone()
+
+        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
+        # Or in future might add like a return_all_steps flag
+        sol = []
+
+        for step in tqdm(range(1, len(t_span))):
+            # print("incontext_x.shape:",incontext_x.shape)
+            # print("noise.shape:",noise.shape)
+            # print("t.shape:",t.shape)
+            x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
+            if(guidance_scale > 1.0):
+
+                model_input = torch.cat([ \
+                    torch.cat([latent_mask_input, latent_mask_input], 0), \
+                    torch.cat([incontext_x, incontext_x], 0), \
+                    torch.cat([torch.zeros_like(mu), mu], 0), \
+                    torch.cat([x, x], 0), \
+                    ], 2)
+                timestep=t.unsqueeze(-1).repeat(2)
+
+                dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
+                dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
+                dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
+            else:
+                model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
+                timestep=t.unsqueeze(-1)
+                dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
+
+            dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
+            # print("dphi_dt.shape:",dphi_dt.shape)
+            # print("x.shape:",x.shape)
+
+            x = x + dt * dphi_dt
+            t = t + dt
+            sol.append(x)
+            if step < len(t_span) - 1:
+                dt = t_span[step + 1] - t
+
+        return sol[-1]
+
+    def projection_loss(self,hidden_proj, bestrq_emb):
+        bsz = hidden_proj.shape[0]
+
+        hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
+        bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
+
+        proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
+        proj_loss = 1+proj_loss.mean()
+
+        return proj_loss
+
+    def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
+        """Computes diffusion loss
+
+        Args:
+            x1 (torch.Tensor): Target
+                shape: (batch_size, n_channels, mel_timesteps, n_feats)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_channels, mel_timesteps, n_feats)
+
+        Returns:
+            loss: conditional flow matching loss
+            y: conditional flow
+                shape: (batch_size, n_channels, mel_timesteps, n_feats)
+        """
+        b = mu[0].shape[0]
+        len_x = x1.shape[2]
+        # random timestep
+        if(validation_mode):
+            t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
+        else:
+            t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
+        # sample noise p(x_0)
+        z = torch.randn_like(x1)
+
+        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
+        u = x1 - (1 - self.sigma_min) * z
+        # print("y.shape:",y.shape)
+        #self.unet(inputs_embeds=model_input, attention_mask=attention_mask,encoder_hidden_states=text_embedding,encoder_attention_mask=txt_attn_mask,time_step=timesteps).last_hidden_state
+        model_input = torch.cat([*mu,y], 2)
+        t=t.squeeze(-1).squeeze(-1)
+        # print("model_input.shape:",model_input.shape)
+        # print("attention_mask.shape:",attention_mask.shape)
+        out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
+        hidden_layer = out.hidden_states[self.ssl_layer]
+        hidden_proj = self.mlp(hidden_layer)
+        # print("hidden_proj.shape:",hidden_proj.shape)
+        # print("mert_emb.shape:",mert_emb.shape)
+        # exit()
+
+
+        out = out.last_hidden_state
+
+        out=out[:,:,-len_x:]
+        # out=self.proj_out(out)
+
+        weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
+        # print("out.shape",out.shape)
+        # print("u.shape",u.shape)
+        loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
+        # print("hidden_proj.shape:",hidden_proj.shape)
+        # print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
+        loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
+        loss = loss_re + loss_cos * 0.5
+        # print("loss_cos:",loss_cos,loss_cos.device)
+        print("loss:",loss,loss.device)
+        # exit()
+        return loss, loss_re, loss_cos
+
+class PromptCondAudioDiffusion(nn.Module):
+    def __init__(
+        self,
+        num_channels,
+        unet_model_name=None,
+        unet_model_config_path=None,
+        snr_gamma=None,
+        hubert_layer=None,
+        ssl_layer=None,
+        uncondition=True,
+        out_paint=False,
+    ):
+        super().__init__()
+
+        assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
+
+        self.unet_model_name = unet_model_name
+        self.unet_model_config_path = unet_model_config_path
+        self.snr_gamma = snr_gamma
+        self.uncondition = uncondition
+        self.num_channels = num_channels
+        self.hubert_layer = hubert_layer
+        self.ssl_layer = ssl_layer
+
+        # https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
+        self.normfeat = Feature1DProcessor(dim=64)
+
+        self.sample_rate = 48000
+        self.num_samples_perseg = self.sample_rate * 20 // 1000
+        self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
+        self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
+        # self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
+        # self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
+        self.bestrq = load_model(
+            model_dir='codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq',
+            checkpoint_dir='ckpt/encode-s12k.pt',
+        )
+        self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
+        self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
+        for v in self.bestrq.parameters():v.requires_grad = False
+        self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
+        for v in self.rvq_bestrq_emb.parameters():v.requires_grad = False
+        self.hubert = HubertModelWithFinalProj.from_pretrained("ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
+        for v in self.hubert.parameters():v.requires_grad = False
+        self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
+        # self.xvecmodel = XVECModel()
+        config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
+        unet = GPT2Model(config)
+        mlp = nn.Sequential(
+            nn.Linear(1200, 1024),
+            nn.SiLU(),
+            nn.Linear(1024, 1024),
+            nn.SiLU(),
+            nn.Linear(1024, 768)
+        )
+        self.set_from = "random"
+        self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
+        self.mask_emb = torch.nn.Embedding(3, 48)
+        print("Transformer initialized from pretrain.")
+        torch.cuda.empty_cache()
+        # self.unet.set_attn_processor(AttnProcessor2_0())
+        # self.unet.set_use_memory_efficient_attention_xformers(True)
+
+        # self.start_embedding = nn.Parameter(torch.randn(1,1024))
+        # self.end_embedding = nn.Parameter(torch.randn(1,1024))
+
+    def compute_snr(self, timesteps):
+        """
+        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+        """
+        alphas_cumprod = self.noise_scheduler.alphas_cumprod
+        sqrt_alphas_cumprod = alphas_cumprod**0.5
+        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+        # Expand the tensors.
+        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
+        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+
+        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
+        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+
+        # Compute SNR.
+        snr = (alpha / sigma) ** 2
+        return snr
+
+    def preprocess_audio(self, input_audios, threshold=0.9):
+        assert len(input_audios.shape) == 2, input_audios.shape
+        norm_value = torch.ones_like(input_audios[:,0])
+        max_volume = input_audios.abs().max(dim=-1)[0]
+        norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
+        return input_audios/norm_value.unsqueeze(-1)
+
+    def extract_wav2vec_embeds(self, input_audios,output_len):
+        wav2vec_stride = 2
+
+        wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
+        # print(wav2vec_embeds)
+        # print("audio.shape:",input_audios.shape)
+        wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
+        # print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
+        wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
+        return wav2vec_embeds_last
+
+    def extract_mert_embeds(self, input_audios):
+        prompt_stride = 3
+        inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
+        input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
+        prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
+        mert_emb= prompt_embeds[-1]
+        mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)
+
+        return mert_emb
+
+    def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
+        self.bestrq.eval()
+        # print("audio shape:",input_audio_0.shape)
+        input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
+        # print("input_wav_mean.shape:",input_wav_mean.shape)
+        # input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
+        input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
+        layer_results = input_wav_mean['layer_results']
+        # print("layer_results.shape:",layer_results[layer].shape)
+        bestrq_emb = layer_results[layer]
+        bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
+        #[b,t,1024] t=t/960
+        #35.84s->batch,896,1024
+        return bestrq_emb
+
+
+    def extract_spk_embeds(self, input_audios):
+        spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
+        spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
+        return spk_embeds
+
+    def extract_lyric_feats(self, lyric):
+        with torch.no_grad():
+            try:
+                text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
+            except:
+                text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
+            text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
+            text_mask = text_mask.to(self.device)
+            text_encoder_hidden_states, text_mask, text_prompt_embeds = \
+                pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
+            text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
+            return text_encoder_hidden_states, text_mask
+
+    def extract_energy_bar(self, input_audios):
+        if(input_audios.shape[-1] % self.num_samples_perseg > 0):
+            energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
+        else:
+            energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
+        energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
+        energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
+        energy_embedding = self.energy_embedding(energy_bar)
+        energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
+        return energy_embedding
+
+    def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
+        additional_feats = ['spk', 'lyric'], \
+        train_rvq=True, train_ssl=False,layer=5):
+        if not hasattr(self,"device"):
+            self.device = input_audios.device
+        if not hasattr(self,"dtype"):
+            self.dtype = input_audios.dtype
+        device = self.device
+        input_audio_0 = input_audios[:,0,:]
+        input_audio_1 = input_audios[:,1,:]
+        input_audio_0 = self.preprocess_audio(input_audio_0)
+        input_audio_1 = self.preprocess_audio(input_audio_1)
+        input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
+        # energy_embedding = self.extract_energy_bar(input_audios)
+        # print("energy_embedding.shape:",energy_embedding.shape)
+        # with autocast(enabled=False):
+        if(train_ssl):
+            self.wav2vec.train()
+            wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
+            self.clap_embd_extractor.train()
+            prompt_embeds = self.extract_mert_embeds(input_audios)
+            if('spk' in additional_feats):
+                self.xvecmodel.train()
+                spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
+        else:
+            with torch.no_grad():
+                with autocast(enabled=False):
+                    bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
+                    # mert_emb = self.extract_mert_embeds(input_audios_mert)
+
+                    wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])
+
+                bestrq_emb = bestrq_emb.detach()
+        if('lyric' in additional_feats):
+            text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
+        else:
+            text_encoder_hidden_states, text_mask = None, None
+
+        # prompt_embeds_13 = torch.cat([mert_emb_13, energy_embedding], 1)
+        # print("prompt_embes.shape:",prompt_embeds.shape)
+        #prompt_embes.shape: torch.Size([3, 1088, 896])
+        # print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
+        #wav2vec_embeds.shape:torch.Size([3, 1024, 896])
+        if(train_rvq):
+            quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
+        else:
+            bestrq_emb = bestrq_emb.float()
+            self.rvq_bestrq_emb.eval()
+            # with autocast(enabled=False):
+            quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
+            commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
+            codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
+            quantized_bestrq_emb = quantized_bestrq_emb.detach()
+
+        commitment_loss = commitment_loss_bestrq_emb
+        codebook_loss = codebook_loss_bestrq_emb
+
+
+        alpha=1
+        quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
+
+        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
+        # print("latent_masks.shape:",latent_masks.shape)
+        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
+
+
+
+        scenario = np.random.choice(['start_seg', 'other_seg'])
+        if(scenario == 'other_seg'):
+            for binx in range(input_audios.shape[0]):
+                # latent_masks[binx,0:64] = 1
+                latent_masks[binx,0:random.randint(64,128)] = 1
+        quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
+        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
+        # print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
+        # print("latent_masks.shape:",latent_masks.shape)
+        quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
+            + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
+
+
+
+
+        if self.uncondition:
+            mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
+            if len(mask_indices) > 0:
+                quantized_bestrq_emb[mask_indices] = 0
+        # print("latents.shape:",latents.shape)
+        latents = latents.permute(0,2,1).contiguous()
+        latents = self.normfeat.project_sample(latents)
+        latents = latents.permute(0,2,1).contiguous()
+        incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
+        attention_mask=(latent_masks > 0.5)
+        B, L = attention_mask.size()
+        attention_mask = attention_mask.view(B, 1, L)
+        attention_mask = attention_mask * attention_mask.transpose(-1, -2)
+        attention_mask = attention_mask.unsqueeze(1)
+        # print("incontext_latents.shape:",incontext_latents.shape)
+        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
+        latent_mask_input = self.mask_emb(latent_masks)
+        #64+48+64+1024
+        loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
+        return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
+
+    def init_device_dtype(self, device, dtype):
+        self.device = device
+        self.dtype = dtype
+
+    @torch.no_grad()
+    def fetch_codes(self, input_audios, additional_feats,layer):
+        input_audio_0 = input_audios[[0],:]
+        input_audio_1 = input_audios[[1],:]
+        input_audio_0 = self.preprocess_audio(input_audio_0)
+        input_audio_1 = self.preprocess_audio(input_audio_1)
+
+        self.bestrq.eval()
+
+        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
+        # bestrq_middle = bestrq_middle.detach()
+        # bestrq_last = bestrq_last.detach()
+        bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
+        bestrq_emb = bestrq_emb.detach()
+
+        # self.rvq_bestrq_middle.eval()
+        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
+        # self.rvq_bestrq_last.eval()
+        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
+
+        self.rvq_bestrq_emb.eval()
+        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
+
+
+        if('spk' in additional_feats):
+            self.xvecmodel.eval()
+            spk_embeds = self.extract_spk_embeds(input_audios)
+        else:
+            spk_embeds = None
+
+        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
+        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
+        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
+        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
+        # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
+
+
+    @torch.no_grad()
+    def fetch_codes_batch(self, input_audios, additional_feats,layer):
+        input_audio_0 = input_audios[:,0,:]
+        input_audio_1 = input_audios[:,1,:]
+        input_audio_0 = self.preprocess_audio(input_audio_0)
+        input_audio_1 = self.preprocess_audio(input_audio_1)
+
+        self.bestrq.eval()
+
+        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
+        # bestrq_middle = bestrq_middle.detach()
+        # bestrq_last = bestrq_last.detach()
+        bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
+        bestrq_emb = bestrq_emb.detach()
+
+        # self.rvq_bestrq_middle.eval()
+        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
+        # self.rvq_bestrq_last.eval()
+        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
+
+        self.rvq_bestrq_emb.eval()
+        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
+
+
+        if('spk' in additional_feats):
+            self.xvecmodel.eval()
+            spk_embeds = self.extract_spk_embeds(input_audios)
+        else:
+            spk_embeds = None
+
+        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
+        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
+        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
+        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
+
+    @torch.no_grad()
+    def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
+                guidance_scale=2, num_steps=20,
+                disable_progress=True, scenario='start_seg'):
+        classifier_free_guidance = guidance_scale > 1.0
+        device = self.device
+        dtype = self.dtype
+        # codes_bestrq_middle, codes_bestrq_last = codes
+        codes_bestrq_emb = codes[0]
+
+
+        batch_size = codes_bestrq_emb.shape[0]
+
+
+        quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
+        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
+        quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
+        print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
+        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
+
+
+
+
+        if('spk' in additional_feats):
+            spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
+
+        num_frames = quantized_bestrq_emb.shape[1]
+
+        num_channels_latents = self.num_channels
+        shape = (batch_size, num_frames, 64)
+        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
+
+
+
+        latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
+        latent_masks[:,0:latent_length] = 2
+        if(scenario=='other_seg'):
+            latent_masks[:,0:incontext_length] = 1
+
+
+
+        quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
+            + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
+        true_latents = true_latents.permute(0,2,1).contiguous()
+        true_latents = self.normfeat.project_sample(true_latents)
+        true_latents = true_latents.permute(0,2,1).contiguous()
+        incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
+        incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
+
+
+        attention_mask=(latent_masks > 0.5)
+        B, L = attention_mask.size()
+        attention_mask = attention_mask.view(B, 1, L)
+        attention_mask = attention_mask * attention_mask.transpose(-1, -2)
+        attention_mask = attention_mask.unsqueeze(1)
+        latent_mask_input = self.mask_emb(latent_masks)
+
+        if('spk' in additional_feats):
+            # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
+            additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds],1)
+        else:
+            # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
+            additional_model_input = torch.cat([quantized_bestrq_emb],1)
+
+        temperature = 1.0
+        t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
+        latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask, guidance_scale)
+
+        latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
+        latents = latents.permute(0,2,1).contiguous()
+        latents = self.normfeat.return_sample(latents)
+        # latents = latents.permute(0,2,1).contiguous()
+        return latents
+
+    @torch.no_grad()
+    def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
+                  disable_progress=True,layer=5,scenario='start_seg'):
+        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
+
+        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
+            guidance_scale=guidance_scale, num_steps=num_steps, \
+            disable_progress=disable_progress,scenario=scenario)
+        return latents
+
+    @torch.no_grad()
+    def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
+                  disable_progress=True,layer=5,scenario='start_seg'):
+        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
+        import time
+        start = time.time()
+        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
+            guidance_scale=guidance_scale, num_steps=num_steps, \
+            disable_progress=disable_progress,scenario=scenario)
+        return latents,time.time()-start
+
+    def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
+        divisor = 4
+        shape = (batch_size, num_channels_latents, num_frames, 32)
+        if(num_frames%divisor>0):
+            num_frames = round(num_frames/float(divisor))*divisor
+            shape = (batch_size, num_channels_latents, num_frames, 32)
+        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
+        return latents
+
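Note for readers skimming the diff: the sampling loop in `BASECFM.solve_euler` above is a plain Euler integrator for conditional flow matching with classifier-free guidance, combining the unconditional and conditional velocity estimates as v = v_uncond + s * (v_cond - v_uncond) before advancing x by dt * v. The stand-alone sketch below only illustrates that update rule with a toy velocity function; `toy_velocity`, the shapes, and the step count are illustrative assumptions, not part of this repository.

from typing import Optional
import torch

def toy_velocity(x: torch.Tensor, t: torch.Tensor, cond: Optional[torch.Tensor]) -> torch.Tensor:
    # Hypothetical stand-in for the GPT-2 estimator: returns a velocity field for state x at time t.
    if cond is None:
        cond = torch.zeros_like(x)
    return cond - x * t

def euler_cfg_sample(x: torch.Tensor, cond: torch.Tensor, num_steps: int = 20, guidance_scale: float = 2.0) -> torch.Tensor:
    # Integrate dx/dt = v(x, t) from t = 0 (noise) to t = 1 (data), mirroring the structure of solve_euler.
    t_span = torch.linspace(0, 1, num_steps + 1)
    for i in range(1, len(t_span)):
        t, dt = t_span[i - 1], t_span[i] - t_span[i - 1]
        v_uncond = toy_velocity(x, t, None)   # unconditional branch (condition zeroed out)
        v_cond = toy_velocity(x, t, cond)     # conditional branch
        v = v_uncond + guidance_scale * (v_cond - v_uncond)  # classifier-free guidance mix
        x = x + dt * v                        # Euler step
    return x

if __name__ == "__main__":
    noise = torch.randn(1, 8, 64)
    cond = torch.randn(1, 8, 64)
    print(euler_cfg_sample(noise, cond).shape)  # torch.Size([1, 8, 64])

In the actual model the two branches are evaluated in a single batched call (the inputs are concatenated along dim 0 and the output is split with chunk(2, 0)), which is an efficiency choice rather than a difference in the math.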
codeclm/tokenizer/Flow1dVAE/model_2rvq.py
CHANGED
@@ -1,774 +1,774 @@
|
|
1 |
-
import yaml
|
2 |
-
import random
|
3 |
-
import inspect
|
4 |
-
import numpy as np
|
5 |
-
from tqdm import tqdm
|
6 |
-
import typing as tp
|
7 |
-
from abc import ABC
|
8 |
-
|
9 |
-
import torch
|
10 |
-
import torch.nn as nn
|
11 |
-
import torch.nn.functional as F
|
12 |
-
import torchaudio
|
13 |
-
|
14 |
-
from einops import repeat
|
15 |
-
from tools.torch_tools import wav_to_fbank
|
16 |
-
|
17 |
-
import diffusers
|
18 |
-
from diffusers.utils.torch_utils import randn_tensor
|
19 |
-
from diffusers import DDPMScheduler
|
20 |
-
from models.transformer_2d_flow import Transformer2DModel
|
21 |
-
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel,HubertModel
|
22 |
-
# from tools.get_mulan import get_mulan
|
23 |
-
from third_party.wespeaker.extract_embd import XVECModel
|
24 |
-
# from libs.rvq2 import RVQEmbedding
|
25 |
-
from libs.rvq.descript_quantize3_4layer_freezelayer1 import ResidualVectorQuantize
|
26 |
-
|
27 |
-
from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
|
28 |
-
from models_gpt.models.gpt2_config import GPT2Config
|
29 |
-
|
30 |
-
from torch.cuda.amp import autocast
|
31 |
-
|
32 |
-
|
33 |
-
from our_MERT_BESTRQ.test import load_model
|
34 |
-
|
35 |
-
class HubertModelWithFinalProj(HubertModel):
|
36 |
-
def __init__(self, config):
|
37 |
-
super().__init__(config)
|
38 |
-
|
39 |
-
# The final projection layer is only used for backward compatibility.
|
40 |
-
# Following https://github.com/auspicious3000/contentvec/issues/6
|
41 |
-
# Remove this layer is necessary to achieve the desired outcome.
|
42 |
-
print("hidden_size:",config.hidden_size)
|
43 |
-
print("classifier_proj_size:",config.classifier_proj_size)
|
44 |
-
self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
|
45 |
-
|
46 |
-
|
47 |
-
class SampleProcessor(torch.nn.Module):
|
48 |
-
def project_sample(self, x: torch.Tensor):
|
49 |
-
"""Project the original sample to the 'space' where the diffusion will happen."""
|
50 |
-
"""Project back from diffusion space to the actual sample space."""
|
51 |
-
return z
|
52 |
-
|
53 |
-
class Feature1DProcessor(SampleProcessor):
|
54 |
-
def __init__(self, dim: int = 100, power_std = 1., \
|
55 |
-
num_samples: int = 100_000, cal_num_frames: int = 600):
|
56 |
-
super().__init__()
|
57 |
-
|
58 |
-
self.num_samples = num_samples
|
59 |
-
self.dim = dim
|
60 |
-
self.power_std = power_std
|
61 |
-
self.cal_num_frames = cal_num_frames
|
62 |
-
self.register_buffer('counts', torch.zeros(1))
|
63 |
-
self.register_buffer('sum_x', torch.zeros(dim))
|
64 |
-
self.register_buffer('sum_x2', torch.zeros(dim))
|
65 |
-
self.register_buffer('sum_target_x2', torch.zeros(dim))
|
66 |
-
self.counts: torch.Tensor
|
67 |
-
self.sum_x: torch.Tensor
|
68 |
-
self.sum_x2: torch.Tensor
|
69 |
-
|
70 |
-
@property
|
71 |
-
def mean(self):
|
72 |
-
mean = self.sum_x / self.counts
|
73 |
-
if(self.counts < 10):
|
74 |
-
mean = torch.zeros_like(mean)
|
75 |
-
return mean
|
76 |
-
|
77 |
-
@property
|
78 |
-
def std(self):
|
79 |
-
std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
|
80 |
-
if(self.counts < 10):
|
81 |
-
std = torch.ones_like(std)
|
82 |
-
return std
|
83 |
-
|
84 |
-
@property
|
85 |
-
def target_std(self):
|
86 |
-
return 1
|
87 |
-
|
88 |
-
def project_sample(self, x: torch.Tensor):
|
89 |
-
assert x.dim() == 3
|
90 |
-
if self.counts.item() < self.num_samples:
|
91 |
-
self.counts += len(x)
|
92 |
-
self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
|
93 |
-
self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
|
94 |
-
rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
|
95 |
-
x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
|
96 |
-
return x
|
97 |
-
|
98 |
-
def return_sample(self, x: torch.Tensor):
|
99 |
-
assert x.dim() == 3
|
100 |
-
rescale = (self.std / self.target_std) ** self.power_std
|
101 |
-
# print(rescale, self.mean)
|
102 |
-
x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
|
103 |
-
return x
|
104 |
-
|
105 |
-
def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
|
106 |
-
if(prior_text_encoder_hidden_states.shape[1]<len_size):
|
107 |
-
prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
|
108 |
-
torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
|
109 |
-
prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
|
110 |
-
dtype=prior_text_encoder_hidden_states.dtype)],1)
|
111 |
-
prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
|
112 |
-
else:
|
113 |
-
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
|
114 |
-
prior_text_mask = prior_text_mask[:,0:len_size]
|
115 |
-
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
|
116 |
-
return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
|
117 |
-
|
118 |
-
class BASECFM(torch.nn.Module, ABC):
|
119 |
-
def __init__(
|
120 |
-
self,
|
121 |
-
estimator,
|
122 |
-
mlp,
|
123 |
-
ssl_layer
|
124 |
-
):
|
125 |
-
super().__init__()
|
126 |
-
self.sigma_min = 1e-4
|
127 |
-
|
128 |
-
self.estimator = estimator
|
129 |
-
self.mlp = mlp
|
130 |
-
self.ssl_layer = ssl_layer
|
131 |
-
|
132 |
-
@torch.inference_mode()
|
133 |
-
def forward(self, mu, n_timesteps, temperature=1.0):
|
134 |
-
"""Forward diffusion
|
135 |
-
|
136 |
-
Args:
|
137 |
-
mu (torch.Tensor): output of encoder
|
138 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
139 |
-
n_timesteps (int): number of diffusion steps
|
140 |
-
temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
|
141 |
-
|
142 |
-
Returns:
|
143 |
-
sample: generated mel-spectrogram
|
144 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
145 |
-
"""
|
146 |
-
z = torch.randn_like(mu) * temperature
|
147 |
-
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
|
148 |
-
return self.solve_euler(z, t_span=t_span)
|
149 |
-
|
150 |
-
def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
|
151 |
-
"""
|
152 |
-
Fixed euler solver for ODEs.
|
153 |
-
Args:
|
154 |
-
x (torch.Tensor): random noise
|
155 |
-
t_span (torch.Tensor): n_timesteps interpolated
|
156 |
-
shape: (n_timesteps + 1,)
|
157 |
-
mu (torch.Tensor): output of encoder
|
158 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
159 |
-
"""
|
160 |
-
t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
|
161 |
-
noise = x.clone()
|
162 |
-
|
163 |
-
# I am storing this because I can later plot it by putting a debugger here and saving it to a file
|
164 |
-
# Or in future might add like a return_all_steps flag
|
165 |
-
sol = []
|
166 |
-
|
167 |
-
for step in tqdm(range(1, len(t_span))):
|
168 |
-
# print("incontext_x.shape:",incontext_x.shape)
|
169 |
-
# print("noise.shape:",noise.shape)
|
170 |
-
# print("t.shape:",t.shape)
|
171 |
-
x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
|
172 |
-
if(guidance_scale > 1.0):
|
173 |
-
|
174 |
-
model_input = torch.cat([ \
|
175 |
-
torch.cat([latent_mask_input, latent_mask_input], 0), \
|
176 |
-
torch.cat([incontext_x, incontext_x], 0), \
|
177 |
-
torch.cat([torch.zeros_like(mu), mu], 0), \
|
178 |
-
torch.cat([x, x], 0), \
|
179 |
-
], 2)
|
180 |
-
timestep=t.unsqueeze(-1).repeat(2)
|
181 |
-
|
182 |
-
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
183 |
-
dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
|
184 |
-
dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
|
185 |
-
else:
|
186 |
-
model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
|
187 |
-
timestep=t.unsqueeze(-1)
|
188 |
-
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
189 |
-
|
190 |
-
dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
|
191 |
-
# print("dphi_dt.shape:",dphi_dt.shape)
|
192 |
-
# print("x.shape:",x.shape)
|
193 |
-
|
194 |
-
x = x + dt * dphi_dt
|
195 |
-
t = t + dt
|
196 |
-
sol.append(x)
|
197 |
-
if step < len(t_span) - 1:
|
198 |
-
dt = t_span[step + 1] - t
|
199 |
-
|
200 |
-
return sol[-1]
|
201 |
-
|
202 |
-
def projection_loss(self,hidden_proj, bestrq_emb):
|
203 |
-
bsz = hidden_proj.shape[0]
|
204 |
-
|
205 |
-
hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
|
206 |
-
bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
|
207 |
-
|
208 |
-
proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
|
209 |
-
proj_loss = 1+proj_loss.mean()
|
210 |
-
|
211 |
-
return proj_loss
|
212 |
-
|
213 |
-
def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
|
214 |
-
"""Computes diffusion loss
|
215 |
-
|
216 |
-
Args:
|
217 |
-
x1 (torch.Tensor): Target
|
218 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
219 |
-
mu (torch.Tensor): output of encoder
|
220 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
221 |
-
|
222 |
-
Returns:
|
223 |
-
loss: conditional flow matching loss
|
224 |
-
y: conditional flow
|
225 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
226 |
-
"""
|
227 |
-
b = mu[0].shape[0]
|
228 |
-
len_x = x1.shape[2]
|
229 |
-
# random timestep
|
230 |
-
if(validation_mode):
|
231 |
-
t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
|
232 |
-
else:
|
233 |
-
t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
|
234 |
-
# sample noise p(x_0)
|
235 |
-
z = torch.randn_like(x1)
|
236 |
-
|
237 |
-
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
238 |
-
u = x1 - (1 - self.sigma_min) * z
|
239 |
-
# print("y.shape:",y.shape)
|
240 |
-
#self.unet(inputs_embeds=model_input, attention_mask=attention_mask,encoder_hidden_states=text_embedding,encoder_attention_mask=txt_attn_mask,time_step=timesteps).last_hidden_state
|
241 |
-
model_input = torch.cat([*mu,y], 2)
|
242 |
-
t=t.squeeze(-1).squeeze(-1)
|
243 |
-
# print("model_input.shape:",model_input.shape)
|
244 |
-
# print("attention_mask.shape:",attention_mask.shape)
|
245 |
-
out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
|
246 |
-
hidden_layer = out.hidden_states[self.ssl_layer]
|
247 |
-
hidden_proj = self.mlp(hidden_layer)
|
248 |
-
# print("hidden_proj.shape:",hidden_proj.shape)
|
249 |
-
# print("mert_emb.shape:",mert_emb.shape)
|
250 |
-
# exit()
|
251 |
-
|
252 |
-
|
253 |
-
out = out.last_hidden_state
|
254 |
-
|
255 |
-
out=out[:,:,-len_x:]
|
256 |
-
# out=self.proj_out(out)
|
257 |
-
|
258 |
-
weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
|
259 |
-
# print("out.shape",out.shape)
|
260 |
-
# print("u.shape",u.shape)
|
261 |
-
loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
|
262 |
-
# print("hidden_proj.shape:",hidden_proj.shape)
|
263 |
-
# print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
|
264 |
-
loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
|
265 |
-
loss = loss_re + loss_cos * 0.5
|
266 |
-
# print("loss_cos:",loss_cos,loss_cos.device)
|
267 |
-
print("loss:",loss,loss.device)
|
268 |
-
# exit()
|
269 |
-
return loss, loss_re, loss_cos
|
270 |
-
|
271 |
-
class PromptCondAudioDiffusion(nn.Module):
|
272 |
-
def __init__(
|
273 |
-
self,
|
274 |
-
num_channels,
|
275 |
-
unet_model_name=None,
|
276 |
-
unet_model_config_path=None,
|
277 |
-
snr_gamma=None,
|
278 |
-
hubert_layer=None,
|
279 |
-
ssl_layer=None,
|
280 |
-
uncondition=True,
|
281 |
-
out_paint=False,
|
282 |
-
):
|
283 |
-
super().__init__()
|
284 |
-
|
285 |
-
assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
|
286 |
-
|
287 |
-
self.unet_model_name = unet_model_name
|
288 |
-
self.unet_model_config_path = unet_model_config_path
|
289 |
-
self.snr_gamma = snr_gamma
|
290 |
-
self.uncondition = uncondition
|
291 |
-
self.num_channels = num_channels
|
292 |
-
self.hubert_layer = hubert_layer
|
293 |
-
self.ssl_layer = ssl_layer
|
294 |
-
|
295 |
-
# https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
|
296 |
-
self.normfeat = Feature1DProcessor(dim=64)
|
297 |
-
|
298 |
-
self.sample_rate = 48000
|
299 |
-
self.num_samples_perseg = self.sample_rate * 20 // 1000
|
300 |
-
self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
|
301 |
-
self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
|
302 |
-
# self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
303 |
-
# self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
304 |
-
self.bestrq = load_model(
|
305 |
-
model_dir='path/to/our-MERT/mert_fairseq',
|
306 |
-
checkpoint_dir='checkpoint-120000.pt',
|
307 |
-
)
|
308 |
-
self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
|
309 |
-
self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
|
310 |
-
for v in self.bestrq.parameters():v.requires_grad = False
|
311 |
-
self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 2, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
312 |
-
# for v in self.rvq_bestrq_emb.parameters():
|
313 |
-
# print(v)
|
314 |
-
freeze_parameters='quantizers.0'
|
315 |
-
for name, param in self.rvq_bestrq_emb.named_parameters():
|
316 |
-
if freeze_parameters in name:
|
317 |
-
param.requires_grad = False
|
318 |
-
print("Freezing RVQ parameters:", name)
|
319 |
-
self.hubert = HubertModelWithFinalProj.from_pretrained("huggingface_cache/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
|
320 |
-
for v in self.hubert.parameters():v.requires_grad = False
|
321 |
-
self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
|
322 |
-
# self.xvecmodel = XVECModel()
|
323 |
-
config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
|
324 |
-
unet = GPT2Model(config)
|
325 |
-
mlp = nn.Sequential(
|
326 |
-
nn.Linear(1200, 1024),
|
327 |
-
nn.SiLU(),
|
328 |
-
nn.Linear(1024, 1024),
|
329 |
-
nn.SiLU(),
|
330 |
-
nn.Linear(1024, 768)
|
331 |
-
)
|
332 |
-
self.set_from = "random"
|
333 |
-
self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
|
334 |
-
self.mask_emb = torch.nn.Embedding(3, 48)
|
335 |
-
print("Transformer initialized from pretrain.")
|
336 |
-
torch.cuda.empty_cache()
|
337 |
-
# self.unet.set_attn_processor(AttnProcessor2_0())
|
338 |
-
# self.unet.set_use_memory_efficient_attention_xformers(True)
|
339 |
-
|
340 |
-
# self.start_embedding = nn.Parameter(torch.randn(1,1024))
|
341 |
-
# self.end_embedding = nn.Parameter(torch.randn(1,1024))
|
342 |
-
|
343 |
-
def compute_snr(self, timesteps):
|
344 |
-
"""
|
345 |
-
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
|
346 |
-
"""
|
347 |
-
alphas_cumprod = self.noise_scheduler.alphas_cumprod
|
348 |
-
sqrt_alphas_cumprod = alphas_cumprod**0.5
|
349 |
-
sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
|
350 |
-
|
351 |
-
# Expand the tensors.
|
352 |
-
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
|
353 |
-
sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
354 |
-
while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
|
355 |
-
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
|
356 |
-
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
|
357 |
-
|
358 |
-
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
359 |
-
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
|
360 |
-
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
|
361 |
-
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
|
362 |
-
|
363 |
-
# Compute SNR.
|
364 |
-
snr = (alpha / sigma) ** 2
|
365 |
-
return snr
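
Equivalently, with \(\bar\alpha_t\) the scheduler's cumulative product, the value returned above is

\[
\mathrm{SNR}(t)\;=\;\Bigl(\tfrac{\sqrt{\bar\alpha_t}}{\sqrt{1-\bar\alpha_t}}\Bigr)^{2}\;=\;\frac{\bar\alpha_t}{1-\bar\alpha_t}.
\]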
|
366 |
-
|
367 |
-
def preprocess_audio(self, input_audios, threshold=0.9):
|
368 |
-
assert len(input_audios.shape) == 2, input_audios.shape
|
369 |
-
norm_value = torch.ones_like(input_audios[:,0])
|
370 |
-
max_volume = input_audios.abs().max(dim=-1)[0]
|
371 |
-
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
372 |
-
return input_audios/norm_value.unsqueeze(-1)
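
A standalone sketch of the same peak-limiting rule (attenuate only clips whose absolute peak exceeds the threshold, leave quieter clips untouched); the helper name and the toy tensor are illustrative, not part of the repository:

```python
import torch

def peak_limit(wav: torch.Tensor, threshold: float = 0.9) -> torch.Tensor:
    """wav: (batch, samples). Scale down only clips whose peak exceeds `threshold`."""
    peak = wav.abs().max(dim=-1, keepdim=True).values            # (batch, 1)
    scale = torch.where(peak > threshold, peak / threshold,      # loud clip -> divide by peak/threshold
                        torch.ones_like(peak))                   # quiet clip -> unchanged
    return wav / scale

x = torch.tensor([[0.5, -1.8, 0.2]])
print(peak_limit(x))   # peak 1.8 > 0.9, so the whole clip is scaled by 0.5 -> [0.25, -0.9, 0.1]
```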
|
373 |
-
|
374 |
-
def extract_wav2vec_embeds(self, input_audios,output_len):
|
375 |
-
wav2vec_stride = 2
|
376 |
-
|
377 |
-
wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
|
378 |
-
# print(wav2vec_embeds)
|
379 |
-
# print("audio.shape:",input_audios.shape)
|
380 |
-
wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
|
381 |
-
# print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
|
382 |
-
wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
|
383 |
-
return wav2vec_embeds_last
|
384 |
-
|
385 |
-
def extract_mert_embeds(self, input_audios):
|
386 |
-
prompt_stride = 3
|
387 |
-
inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
|
388 |
-
input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
|
389 |
-
prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
|
390 |
-
mert_emb= prompt_embeds[-1]
|
391 |
-
mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)
|
392 |
-
|
393 |
-
return mert_emb
|
394 |
-
|
395 |
-
def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
|
396 |
-
self.bestrq.eval()
|
397 |
-
# print("audio shape:",input_audio_0.shape)
|
398 |
-
input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
|
399 |
-
# print("input_wav_mean.shape:",input_wav_mean.shape)
|
400 |
-
# input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
|
401 |
-
input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
|
402 |
-
layer_results = input_wav_mean['layer_results']
|
403 |
-
# print("layer_results.shape:",layer_results[layer].shape)
|
404 |
-
bestrq_emb = layer_results[layer]
|
405 |
-
bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
|
406 |
-
#[b,t,1024] t=t/960
|
407 |
-
#35.84s->batch,896,1024
|
408 |
-
return bestrq_emb
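
The shape comment above follows from the 24 kHz resample and the 960-sample hop noted in the comments:

\[
\frac{35.84\,\mathrm{s}\times 24\,000\,\mathrm{Hz}}{960\ \mathrm{samples/frame}} \;=\; 896\ \text{frames},
\qquad
\frac{24\,000}{960} \;=\; 25\ \text{frames/s},
\]

each frame being a 1024-dimensional embedding.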
|
409 |
-
|
410 |
-
|
411 |
-
def extract_spk_embeds(self, input_audios):
|
412 |
-
spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
|
413 |
-
spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
|
414 |
-
return spk_embeds
|
415 |
-
|
416 |
-
def extract_lyric_feats(self, lyric):
|
417 |
-
with torch.no_grad():
|
418 |
-
try:
|
419 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
|
420 |
-
except:
|
421 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
|
422 |
-
text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
|
423 |
-
text_mask = text_mask.to(self.device)
|
424 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = \
|
425 |
-
pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
|
426 |
-
text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
|
427 |
-
return text_encoder_hidden_states, text_mask
|
428 |
-
|
429 |
-
def extract_energy_bar(self, input_audios):
|
430 |
-
if(input_audios.shape[-1] % self.num_samples_perseg > 0):
|
431 |
-
energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
432 |
-
else:
|
433 |
-
energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
434 |
-
energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
|
435 |
-
energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
|
436 |
-
energy_embedding = self.energy_embedding(energy_bar)
|
437 |
-
energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
|
438 |
-
return energy_embedding
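
As a worked example of the 20 ms energy quantization above (48 000 × 20 / 1000 = 960 samples per segment), assuming a segment RMS of 0.707:

\[
20\log_{10}(0.707)\;\approx\;-3\ \mathrm{dB},
\qquad
\operatorname{int}\!\Bigl(\operatorname{clamp}\bigl(\tfrac{-3}{2}+16,\;0,\;16\bigr)\Bigr)\;=\;\operatorname{int}(14.5)\;=\;14,
\]

so such a segment falls into energy bin 14 of the 17 available bins (0–16).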
|
439 |
-
|
440 |
-
def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
|
441 |
-
additional_feats = ['spk', 'lyric'], \
|
442 |
-
train_rvq=True, train_ssl=False,layer=5):
|
443 |
-
if not hasattr(self,"device"):
|
444 |
-
self.device = input_audios.device
|
445 |
-
if not hasattr(self,"dtype"):
|
446 |
-
self.dtype = input_audios.dtype
|
447 |
-
device = self.device
|
448 |
-
input_audio_0 = input_audios[:,0,:]
|
449 |
-
input_audio_1 = input_audios[:,1,:]
|
450 |
-
input_audio_0 = self.preprocess_audio(input_audio_0)
|
451 |
-
input_audio_1 = self.preprocess_audio(input_audio_1)
|
452 |
-
input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
|
453 |
-
# energy_embedding = self.extract_energy_bar(input_audios)
|
454 |
-
# print("energy_embedding.shape:",energy_embedding.shape)
|
455 |
-
# with autocast(enabled=False):
|
456 |
-
if(train_ssl):
|
457 |
-
self.wav2vec.train()
|
458 |
-
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
|
459 |
-
self.clap_embd_extractor.train()
|
460 |
-
prompt_embeds = self.extract_mert_embeds(input_audios)
|
461 |
-
if('spk' in additional_feats):
|
462 |
-
self.xvecmodel.train()
|
463 |
-
spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
|
464 |
-
else:
|
465 |
-
with torch.no_grad():
|
466 |
-
with autocast(enabled=False):
|
467 |
-
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
468 |
-
# mert_emb = self.extract_mert_embeds(input_audios_mert)
|
469 |
-
|
470 |
-
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])
|
471 |
-
|
472 |
-
bestrq_emb = bestrq_emb.detach()
|
473 |
-
if('lyric' in additional_feats):
|
474 |
-
text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
|
475 |
-
else:
|
476 |
-
text_encoder_hidden_states, text_mask = None, None
|
477 |
-
|
478 |
-
|
479 |
-
if(train_rvq):
|
480 |
-
random_num=random.random()
|
481 |
-
if(random_num<0.6):
|
482 |
-
rvq_layer = 1
|
483 |
-
elif(random_num<0.8):
|
484 |
-
rvq_layer = 2
|
485 |
-
else:
|
486 |
-
rvq_layer = 4
|
487 |
-
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb,n_quantizers=rvq_layer) # b,d,t
|
488 |
-
else:
|
489 |
-
bestrq_emb = bestrq_emb.float()
|
490 |
-
self.rvq_bestrq_emb.eval()
|
491 |
-
# with autocast(enabled=False):
|
492 |
-
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
493 |
-
commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
|
494 |
-
codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
|
495 |
-
quantized_bestrq_emb = quantized_bestrq_emb.detach()
|
496 |
-
|
497 |
-
commitment_loss = commitment_loss_bestrq_emb
|
498 |
-
codebook_loss = codebook_loss_bestrq_emb
|
499 |
-
|
500 |
-
|
501 |
-
alpha=1
|
502 |
-
quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
|
503 |
-
|
504 |
-
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
505 |
-
# print("latent_masks.shape:",latent_masks.shape)
|
506 |
-
# quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
scenario = np.random.choice(['start_seg', 'other_seg'])
|
511 |
-
if(scenario == 'other_seg'):
|
512 |
-
for binx in range(input_audios.shape[0]):
|
513 |
-
# latent_masks[binx,0:64] = 1
|
514 |
-
latent_masks[binx,0:random.randint(64,128)] = 1
|
515 |
-
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
516 |
-
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
517 |
-
# print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
|
518 |
-
# print("latent_masks.shape:",latent_masks.shape)
|
519 |
-
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
520 |
-
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
if self.uncondition:
|
526 |
-
mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
|
527 |
-
if len(mask_indices) > 0:
|
528 |
-
quantized_bestrq_emb[mask_indices] = 0
|
529 |
-
# print("latents.shape:",latents.shape)
|
530 |
-
latents = latents.permute(0,2,1).contiguous()
|
531 |
-
latents = self.normfeat.project_sample(latents)
|
532 |
-
latents = latents.permute(0,2,1).contiguous()
|
533 |
-
incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
534 |
-
attention_mask=(latent_masks > 0.5)
|
535 |
-
B, L = attention_mask.size()
|
536 |
-
attention_mask = attention_mask.view(B, 1, L)
|
537 |
-
attention_mask = attention_mask * attention_mask.transpose(-1, -2)
|
538 |
-
attention_mask = attention_mask.unsqueeze(1)
|
539 |
-
# print("incontext_latents.shape:",incontext_latents.shape)
|
540 |
-
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
541 |
-
latent_mask_input = self.mask_emb(latent_masks)
|
542 |
-
#64+48+64+1024
|
543 |
-
loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
|
544 |
-
return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
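
A minimal sketch of how the training `forward` above might be driven. The tensor layout follows the code (stereo 48 kHz input, 64-dim latent frames, mask values in {0, 1, 2}, roughly 25 latent frames per second of audio), but the batch size, durations, config path and the availability of the SSL/RVQ checkpoints loaded in `__init__` are assumptions made only for illustration.

```python
import torch

# Hypothetical shapes for one training step (10 s of audio -> ~250 BEST-RQ frames at 25 Hz).
B, T_wav, T_lat = 2, 48000 * 10, 250
model = PromptCondAudioDiffusion(
    num_channels=64,
    unet_model_config_path="configs/diffusion.yaml",  # placeholder; only needs to be non-None for the assert
)

input_audios = torch.randn(B, 2, T_wav)                      # (batch, 2 channels, samples) at 48 kHz
latents      = torch.randn(B, T_lat, 64)                     # VAE latent frames, 64 feature dims
latent_masks = torch.full((B, T_lat), 2, dtype=torch.long)   # 2 = frames that count towards the loss

loss, loss_re, loss_cos, commit, codebook = model(
    input_audios, lyric=None, latents=latents, latent_masks=latent_masks,
    additional_feats=[],            # skip the 'spk'/'lyric' branches
    train_rvq=True, train_ssl=False, layer=5,
)
```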
|
545 |
-
|
546 |
-
def init_device_dtype(self, device, dtype):
|
547 |
-
self.device = device
|
548 |
-
self.dtype = dtype
|
549 |
-
|
550 |
-
@torch.no_grad()
|
551 |
-
def fetch_codes(self, input_audios, additional_feats,layer,rvq_num=1):
|
552 |
-
input_audio_0 = input_audios[[0],:]
|
553 |
-
input_audio_1 = input_audios[[1],:]
|
554 |
-
input_audio_0 = self.preprocess_audio(input_audio_0)
|
555 |
-
input_audio_1 = self.preprocess_audio(input_audio_1)
|
556 |
-
|
557 |
-
self.bestrq.eval()
|
558 |
-
|
559 |
-
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
560 |
-
# bestrq_middle = bestrq_middle.detach()
|
561 |
-
# bestrq_last = bestrq_last.detach()
|
562 |
-
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
563 |
-
bestrq_emb = bestrq_emb.detach()
|
564 |
-
|
565 |
-
# self.rvq_bestrq_middle.eval()
|
566 |
-
# quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
|
567 |
-
# self.rvq_bestrq_last.eval()
|
568 |
-
# quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
|
569 |
-
|
570 |
-
self.rvq_bestrq_emb.eval()
|
571 |
-
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
|
572 |
-
codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
|
573 |
-
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
574 |
-
# exit()
|
575 |
-
|
576 |
-
|
577 |
-
if('spk' in additional_feats):
|
578 |
-
self.xvecmodel.eval()
|
579 |
-
spk_embeds = self.extract_spk_embeds(input_audios)
|
580 |
-
else:
|
581 |
-
spk_embeds = None
|
582 |
-
|
583 |
-
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
584 |
-
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
585 |
-
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
586 |
-
return [codes_bestrq_emb], [bestrq_emb], spk_embeds
|
587 |
-
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
588 |
-
|
589 |
-
@torch.no_grad()
|
590 |
-
def fetch_codes_batch(self, input_audios, additional_feats,layer,rvq_num=1):
|
591 |
-
input_audio_0 = input_audios[:,0,:]
|
592 |
-
input_audio_1 = input_audios[:,1,:]
|
593 |
-
input_audio_0 = self.preprocess_audio(input_audio_0)
|
594 |
-
input_audio_1 = self.preprocess_audio(input_audio_1)
|
595 |
-
|
596 |
-
self.bestrq.eval()
|
597 |
-
|
598 |
-
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
599 |
-
# bestrq_middle = bestrq_middle.detach()
|
600 |
-
# bestrq_last = bestrq_last.detach()
|
601 |
-
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
602 |
-
bestrq_emb = bestrq_emb.detach()
|
603 |
-
|
604 |
-
# self.rvq_bestrq_middle.eval()
|
605 |
-
# quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
|
606 |
-
# self.rvq_bestrq_last.eval()
|
607 |
-
# quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
|
608 |
-
|
609 |
-
self.rvq_bestrq_emb.eval()
|
610 |
-
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
|
611 |
-
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
612 |
-
codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
|
613 |
-
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
614 |
-
# exit()
|
615 |
-
|
616 |
-
|
617 |
-
if('spk' in additional_feats):
|
618 |
-
self.xvecmodel.eval()
|
619 |
-
spk_embeds = self.extract_spk_embeds(input_audios)
|
620 |
-
else:
|
621 |
-
spk_embeds = None
|
622 |
-
|
623 |
-
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
624 |
-
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
625 |
-
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
626 |
-
return [codes_bestrq_emb], [bestrq_emb], spk_embeds
|
627 |
-
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
628 |
-
|
629 |
-
@torch.no_grad()
|
630 |
-
def fetch_codes_batch_ds(self, input_audios, additional_feats, layer, rvq_num=1, ds=250):
|
631 |
-
input_audio_0 = input_audios[:,0,:]
|
632 |
-
input_audio_1 = input_audios[:,1,:]
|
633 |
-
input_audio_0 = self.preprocess_audio(input_audio_0)
|
634 |
-
input_audio_1 = self.preprocess_audio(input_audio_1)
|
635 |
-
|
636 |
-
self.bestrq.eval()
|
637 |
-
|
638 |
-
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
639 |
-
# bestrq_middle = bestrq_middle.detach()
|
640 |
-
# bestrq_last = bestrq_last.detach()
|
641 |
-
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
642 |
-
bestrq_emb = bestrq_emb.detach()
|
643 |
-
|
644 |
-
# self.rvq_bestrq_middle.eval()
|
645 |
-
# quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
|
646 |
-
# self.rvq_bestrq_last.eval()
|
647 |
-
# quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
|
648 |
-
|
649 |
-
self.rvq_bestrq_emb.eval()
|
650 |
-
bestrq_emb = torch.nn.functional.avg_pool1d(bestrq_emb, kernel_size=ds, stride=ds)
|
651 |
-
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
|
652 |
-
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
653 |
-
codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
|
654 |
-
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
655 |
-
# exit()
|
656 |
-
|
657 |
-
|
658 |
-
if('spk' in additional_feats):
|
659 |
-
self.xvecmodel.eval()
|
660 |
-
spk_embeds = self.extract_spk_embeds(input_audios)
|
661 |
-
else:
|
662 |
-
spk_embeds = None
|
663 |
-
|
664 |
-
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
665 |
-
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
666 |
-
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
667 |
-
return [codes_bestrq_emb], [bestrq_emb], spk_embeds
|
668 |
-
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
669 |
-
|
670 |
-
@torch.no_grad()
|
671 |
-
def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
|
672 |
-
guidance_scale=2, num_steps=20,
|
673 |
-
disable_progress=True, scenario='start_seg'):
|
674 |
-
classifier_free_guidance = guidance_scale > 1.0
|
675 |
-
device = self.device
|
676 |
-
dtype = self.dtype
|
677 |
-
# codes_bestrq_middle, codes_bestrq_last = codes
|
678 |
-
codes_bestrq_emb = codes[0]
|
679 |
-
|
680 |
-
|
681 |
-
batch_size = codes_bestrq_emb.shape[0]
|
682 |
-
|
683 |
-
|
684 |
-
quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
|
685 |
-
# quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
|
686 |
-
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
687 |
-
print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
688 |
-
# quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
if('spk' in additional_feats):
|
694 |
-
spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
|
695 |
-
|
696 |
-
num_frames = quantized_bestrq_emb.shape[1]
|
697 |
-
|
698 |
-
num_channels_latents = self.num_channels
|
699 |
-
shape = (batch_size, num_frames, 64)
|
700 |
-
latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
|
705 |
-
latent_masks[:,0:latent_length] = 2
|
706 |
-
if(scenario=='other_seg'):
|
707 |
-
latent_masks[:,0:incontext_length] = 1
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
712 |
-
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
713 |
-
true_latents = true_latents.permute(0,2,1).contiguous()
|
714 |
-
true_latents = self.normfeat.project_sample(true_latents)
|
715 |
-
true_latents = true_latents.permute(0,2,1).contiguous()
|
716 |
-
incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
717 |
-
incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
|
718 |
-
|
719 |
-
|
720 |
-
attention_mask=(latent_masks > 0.5)
|
721 |
-
B, L = attention_mask.size()
|
722 |
-
attention_mask = attention_mask.view(B, 1, L)
|
723 |
-
attention_mask = attention_mask * attention_mask.transpose(-1, -2)
|
724 |
-
attention_mask = attention_mask.unsqueeze(1)
|
725 |
-
latent_mask_input = self.mask_emb(latent_masks)
|
726 |
-
|
727 |
-
if('spk' in additional_feats):
|
728 |
-
# additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
|
729 |
-
additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds],1)
|
730 |
-
else:
|
731 |
-
# additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
|
732 |
-
additional_model_input = torch.cat([quantized_bestrq_emb],1)
|
733 |
-
|
734 |
-
temperature = 1.0
|
735 |
-
t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
|
736 |
-
latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask, guidance_scale)
|
737 |
-
|
738 |
-
latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
|
739 |
-
latents = latents.permute(0,2,1).contiguous()
|
740 |
-
latents = self.normfeat.return_sample(latents)
|
741 |
-
# latents = latents.permute(0,2,1).contiguous()
|
742 |
-
return latents
|
743 |
-
|
744 |
-
@torch.no_grad()
|
745 |
-
def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
|
746 |
-
disable_progress=True,layer=5,scenario='start_seg',rvq_num=1):
|
747 |
-
codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer,rvq_num)
|
748 |
-
|
749 |
-
latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
|
750 |
-
guidance_scale=guidance_scale, num_steps=num_steps, \
|
751 |
-
disable_progress=disable_progress,scenario=scenario)
|
752 |
-
return latents
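
A sketch of the two-stage inference path above (RVQ token extraction via `fetch_codes`, then flow-matching generation via `inference_codes`), continuing from a `PromptCondAudioDiffusion` instance `model` like the one sketched earlier; the durations and device settings are illustrative assumptions, and the SSL/RVQ checkpoints are assumed to be available.

```python
import torch

device, dtype = torch.device("cuda"), torch.float32
model = model.to(device).eval()
model.init_device_dtype(device, dtype)                      # inference_codes reads self.device / self.dtype

ref_audio    = torch.randn(2, 48000 * 10, device=device)    # stereo reference, 10 s at 48 kHz
true_latents = torch.randn(1, 250, 64, device=device)       # ~25 latent frames per second

latents = model.inference(
    ref_audio, lyric=None, true_latents=true_latents, latent_length=250,
    additional_feats=[], guidance_scale=2, num_steps=20,
    layer=5, scenario='start_seg', rvq_num=1,
)
# latents: (batch, 64, frames) after return_sample, i.e. back in the original latent space.
```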
|
753 |
-
|
754 |
-
@torch.no_grad()
|
755 |
-
def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
|
756 |
-
disable_progress=True,layer=5,scenario='start_seg'):
|
757 |
-
codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
|
758 |
-
import time
|
759 |
-
start = time.time()
|
760 |
-
latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
|
761 |
-
guidance_scale=guidance_scale, num_steps=num_steps, \
|
762 |
-
disable_progress=disable_progress,scenario=scenario)
|
763 |
-
return latents,time.time()-start
|
764 |
-
|
765 |
-
def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
|
766 |
-
divisor = 4
|
767 |
-
shape = (batch_size, num_channels_latents, num_frames, 32)
|
768 |
-
if(num_frames%divisor>0):
|
769 |
-
num_frames = round(num_frames/float(divisor))*divisor
|
770 |
-
shape = (batch_size, num_channels_latents, num_frames, 32)
|
771 |
-
latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
|
772 |
-
return latents
|
773 |
-
|
774 |
-
|

import yaml
import random
import inspect
import numpy as np
from tqdm import tqdm
import typing as tp
from abc import ABC

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from einops import repeat
from tools.torch_tools import wav_to_fbank

import diffusers
from diffusers.utils.torch_utils import randn_tensor
from diffusers import DDPMScheduler
from models.transformer_2d_flow import Transformer2DModel
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel, HubertModel
# from tools.get_mulan import get_mulan
from third_party.wespeaker.extract_embd import XVECModel
# from libs.rvq2 import RVQEmbedding
from libs.rvq.descript_quantize3_4layer_freezelayer1 import ResidualVectorQuantize

from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
from models_gpt.models.gpt2_config import GPT2Config

from torch.cuda.amp import autocast

from our_MERT_BESTRQ.test import load_model
|
34 |
+
|
35 |
+
class HubertModelWithFinalProj(HubertModel):
|
36 |
+
def __init__(self, config):
|
37 |
+
super().__init__(config)
|
38 |
+
|
39 |
+
# The final projection layer is only used for backward compatibility.
|
40 |
+
# Following https://github.com/auspicious3000/contentvec/issues/6
|
41 |
+
# Removing this layer is necessary to achieve the desired outcome.
|
42 |
+
print("hidden_size:",config.hidden_size)
|
43 |
+
print("classifier_proj_size:",config.classifier_proj_size)
|
44 |
+
self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
|
45 |
+
|
46 |
+
|
47 |
+
class SampleProcessor(torch.nn.Module):
|
48 |
+
def project_sample(self, x: torch.Tensor):
    """Project the original sample to the 'space' where the diffusion will happen."""
    # Identity by default; Feature1DProcessor below overrides this with running statistics.
    return x

def return_sample(self, z: torch.Tensor):
    """Project back from diffusion space to the actual sample space."""
    # Identity by default as well; see Feature1DProcessor.return_sample below.
    return z
|
52 |
+
|
53 |
+
class Feature1DProcessor(SampleProcessor):
|
54 |
+
def __init__(self, dim: int = 100, power_std = 1., \
|
55 |
+
num_samples: int = 100_000, cal_num_frames: int = 600):
|
56 |
+
super().__init__()
|
57 |
+
|
58 |
+
self.num_samples = num_samples
|
59 |
+
self.dim = dim
|
60 |
+
self.power_std = power_std
|
61 |
+
self.cal_num_frames = cal_num_frames
|
62 |
+
self.register_buffer('counts', torch.zeros(1))
|
63 |
+
self.register_buffer('sum_x', torch.zeros(dim))
|
64 |
+
self.register_buffer('sum_x2', torch.zeros(dim))
|
65 |
+
self.register_buffer('sum_target_x2', torch.zeros(dim))
|
66 |
+
self.counts: torch.Tensor
|
67 |
+
self.sum_x: torch.Tensor
|
68 |
+
self.sum_x2: torch.Tensor
|
69 |
+
|
70 |
+
@property
|
71 |
+
def mean(self):
|
72 |
+
mean = self.sum_x / self.counts
|
73 |
+
if(self.counts < 10):
|
74 |
+
mean = torch.zeros_like(mean)
|
75 |
+
return mean
|
76 |
+
|
77 |
+
@property
|
78 |
+
def std(self):
|
79 |
+
std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
|
80 |
+
if(self.counts < 10):
|
81 |
+
std = torch.ones_like(std)
|
82 |
+
return std
|
83 |
+
|
84 |
+
@property
|
85 |
+
def target_std(self):
|
86 |
+
return 1
|
87 |
+
|
88 |
+
def project_sample(self, x: torch.Tensor):
|
89 |
+
assert x.dim() == 3
|
90 |
+
if self.counts.item() < self.num_samples:
|
91 |
+
self.counts += len(x)
|
92 |
+
self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
|
93 |
+
self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
|
94 |
+
rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
|
95 |
+
x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
|
96 |
+
return x
|
97 |
+
|
98 |
+
def return_sample(self, x: torch.Tensor):
|
99 |
+
assert x.dim() == 3
|
100 |
+
rescale = (self.std / self.target_std) ** self.power_std
|
101 |
+
# print(rescale, self.mean)
|
102 |
+
x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
|
103 |
+
return x
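
In formula form, the running buffers above give, per feature channel,

\[
\mu=\frac{\sum x}{N},\qquad
\sigma=\sqrt{\frac{\sum x^{2}}{N}-\mu^{2}},\qquad
\texttt{project\_sample}(x)=(x-\mu)\Bigl(\frac{\sigma_{\text{target}}}{\sigma}\Bigr)^{p},\qquad
\texttt{return\_sample}(z)=z\Bigl(\frac{\sigma}{\sigma_{\text{target}}}\Bigr)^{p}+\mu,
\]

with \(\sigma_{\text{target}}=1\), \(p=\texttt{power\_std}\), and both statistics falling back to \(\mu=0,\ \sigma=1\) until the running count reaches 10.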
|
104 |
+
|
105 |
+
def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
|
106 |
+
if(prior_text_encoder_hidden_states.shape[1]<len_size):
|
107 |
+
prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
|
108 |
+
torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
|
109 |
+
prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
|
110 |
+
dtype=prior_text_encoder_hidden_states.dtype)],1)
|
111 |
+
prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
|
112 |
+
else:
|
113 |
+
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
|
114 |
+
prior_text_mask = prior_text_mask[:,0:len_size]
|
115 |
+
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
|
116 |
+
return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
|
117 |
+
|
118 |
+
class BASECFM(torch.nn.Module, ABC):
|
119 |
+
def __init__(
|
120 |
+
self,
|
121 |
+
estimator,
|
122 |
+
mlp,
|
123 |
+
ssl_layer
|
124 |
+
):
|
125 |
+
super().__init__()
|
126 |
+
self.sigma_min = 1e-4
|
127 |
+
|
128 |
+
self.estimator = estimator
|
129 |
+
self.mlp = mlp
|
130 |
+
self.ssl_layer = ssl_layer
|
131 |
+
|
132 |
+
@torch.inference_mode()
|
133 |
+
def forward(self, mu, n_timesteps, temperature=1.0):
|
134 |
+
"""Forward diffusion
|
135 |
+
|
136 |
+
Args:
|
137 |
+
mu (torch.Tensor): output of encoder
|
138 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
139 |
+
n_timesteps (int): number of diffusion steps
|
140 |
+
temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
|
141 |
+
|
142 |
+
Returns:
|
143 |
+
sample: generated mel-spectrogram
|
144 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
145 |
+
"""
|
146 |
+
z = torch.randn_like(mu) * temperature
|
147 |
+
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
|
148 |
+
return self.solve_euler(z, t_span=t_span)
|
149 |
+
|
150 |
+
def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
|
151 |
+
"""
|
152 |
+
Fixed-step Euler solver for ODEs.
|
153 |
+
Args:
|
154 |
+
x (torch.Tensor): random noise
|
155 |
+
t_span (torch.Tensor): n_timesteps interpolated
|
156 |
+
shape: (n_timesteps + 1,)
|
157 |
+
mu (torch.Tensor): output of encoder
|
158 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
159 |
+
"""
|
160 |
+
t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
|
161 |
+
noise = x.clone()
|
162 |
+
|
163 |
+
# I am storing this because I can later plot it by putting a debugger here and saving it to a file
|
164 |
+
# Or in future might add like a return_all_steps flag
|
165 |
+
sol = []
|
166 |
+
|
167 |
+
for step in tqdm(range(1, len(t_span))):
|
168 |
+
# print("incontext_x.shape:",incontext_x.shape)
|
169 |
+
# print("noise.shape:",noise.shape)
|
170 |
+
# print("t.shape:",t.shape)
|
171 |
+
x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
|
172 |
+
if(guidance_scale > 1.0):
|
173 |
+
|
174 |
+
model_input = torch.cat([ \
|
175 |
+
torch.cat([latent_mask_input, latent_mask_input], 0), \
|
176 |
+
torch.cat([incontext_x, incontext_x], 0), \
|
177 |
+
torch.cat([torch.zeros_like(mu), mu], 0), \
|
178 |
+
torch.cat([x, x], 0), \
|
179 |
+
], 2)
|
180 |
+
timestep=t.unsqueeze(-1).repeat(2)
|
181 |
+
|
182 |
+
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
183 |
+
dphi_dt_uncond, dphi_dt_cond = dphi_dt.chunk(2, 0)
|
184 |
+
dphi_dt = dphi_dt_uncond + guidance_scale * (dphi_dt_cond - dphi_dt_uncond)
|
185 |
+
else:
|
186 |
+
model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
|
187 |
+
timestep=t.unsqueeze(-1)
|
188 |
+
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
189 |
+
|
190 |
+
dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
|
191 |
+
# print("dphi_dt.shape:",dphi_dt.shape)
|
192 |
+
# print("x.shape:",x.shape)
|
193 |
+
|
194 |
+
x = x + dt * dphi_dt
|
195 |
+
t = t + dt
|
196 |
+
sol.append(x)
|
197 |
+
if step < len(t_span) - 1:
|
198 |
+
dt = t_span[step + 1] - t
|
199 |
+
|
200 |
+
return sol[-1]
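
Written out, each loop iteration above is a plain Euler update with classifier-free guidance on the estimated velocity, while the in-context prefix is re-noised to the current time from the stored initial noise:

\[
v \;=\; v_{\varnothing} + s\,\bigl(v_{c}-v_{\varnothing}\bigr),
\qquad
x \;\leftarrow\; x + \Delta t\,v,
\qquad
x_{0:L_{\text{ctx}}} \;=\; \bigl(1-(1-\sigma_{\min})\,t\bigr)\,\varepsilon_{0:L_{\text{ctx}}} + t\,x^{\text{ctx}}_{0:L_{\text{ctx}}},
\]

with \(s\) = `guidance_scale` and \(v_c\), \(v_{\varnothing}\) the conditional and unconditional estimator outputs.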
|
201 |
+
|
202 |
+
def projection_loss(self,hidden_proj, bestrq_emb):
|
203 |
+
bsz = hidden_proj.shape[0]
|
204 |
+
|
205 |
+
hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
|
206 |
+
bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
|
207 |
+
|
208 |
+
proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
|
209 |
+
proj_loss = 1+proj_loss.mean()
|
210 |
+
|
211 |
+
return proj_loss
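
That is, the projection loss above is one minus the mean cosine similarity between the projected hidden states and the SSL targets:

\[
\mathcal L_{\text{proj}} \;=\; 1-\frac{1}{BT}\sum_{b,t}\frac{\langle h_{b,t},\,e_{b,t}\rangle}{\lVert h_{b,t}\rVert\,\lVert e_{b,t}\rVert}.
\]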
|
212 |
+
|
213 |
+
def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
|
214 |
+
"""Computes diffusion loss
|
215 |
+
|
216 |
+
Args:
|
217 |
+
x1 (torch.Tensor): Target
|
218 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
219 |
+
mu (torch.Tensor): output of encoder
|
220 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
221 |
+
|
222 |
+
Returns:
|
223 |
+
loss: conditional flow matching loss
|
224 |
+
y: conditional flow
|
225 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
226 |
+
"""
|
227 |
+
b = mu[0].shape[0]
|
228 |
+
len_x = x1.shape[2]
|
229 |
+
# random timestep
|
230 |
+
if(validation_mode):
|
231 |
+
t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
|
232 |
+
else:
|
233 |
+
t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
|
234 |
+
# sample noise p(x_0)
|
235 |
+
z = torch.randn_like(x1)
|
236 |
+
|
237 |
+
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
238 |
+
u = x1 - (1 - self.sigma_min) * z
|
239 |
+
# print("y.shape:",y.shape)
|
240 |
+
#self.unet(inputs_embeds=model_input, attention_mask=attention_mask,encoder_hidden_states=text_embedding,encoder_attention_mask=txt_attn_mask,time_step=timesteps).last_hidden_state
|
241 |
+
model_input = torch.cat([*mu,y], 2)
|
242 |
+
t=t.squeeze(-1).squeeze(-1)
|
243 |
+
# print("model_input.shape:",model_input.shape)
|
244 |
+
# print("attention_mask.shape:",attention_mask.shape)
|
245 |
+
out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
|
246 |
+
hidden_layer = out.hidden_states[self.ssl_layer]
|
247 |
+
hidden_proj = self.mlp(hidden_layer)
|
248 |
+
# print("hidden_proj.shape:",hidden_proj.shape)
|
249 |
+
# print("mert_emb.shape:",mert_emb.shape)
|
250 |
+
# exit()
|
251 |
+
|
252 |
+
|
253 |
+
out = out.last_hidden_state
|
254 |
+
|
255 |
+
out=out[:,:,-len_x:]
|
256 |
+
# out=self.proj_out(out)
|
257 |
+
|
258 |
+
weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
|
259 |
+
# print("out.shape",out.shape)
|
260 |
+
# print("u.shape",u.shape)
|
261 |
+
loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
|
262 |
+
# print("hidden_proj.shape:",hidden_proj.shape)
|
263 |
+
# print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
|
264 |
+
loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
|
265 |
+
loss = loss_re + loss_cos * 0.5
|
266 |
+
# print("loss_cos:",loss_cos,loss_cos.device)
|
267 |
+
print("loss:",loss,loss.device)
|
268 |
+
# exit()
|
269 |
+
return loss, loss_re, loss_cos
|
270 |
+
|
271 |
+
class PromptCondAudioDiffusion(nn.Module):
|
272 |
+
def __init__(
|
273 |
+
self,
|
274 |
+
num_channels,
|
275 |
+
unet_model_name=None,
|
276 |
+
unet_model_config_path=None,
|
277 |
+
snr_gamma=None,
|
278 |
+
hubert_layer=None,
|
279 |
+
ssl_layer=None,
|
280 |
+
uncondition=True,
|
281 |
+
out_paint=False,
|
282 |
+
):
|
283 |
+
super().__init__()
|
284 |
+
|
285 |
+
assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
|
286 |
+
|
287 |
+
self.unet_model_name = unet_model_name
|
288 |
+
self.unet_model_config_path = unet_model_config_path
|
289 |
+
self.snr_gamma = snr_gamma
|
290 |
+
self.uncondition = uncondition
|
291 |
+
self.num_channels = num_channels
|
292 |
+
self.hubert_layer = hubert_layer
|
293 |
+
self.ssl_layer = ssl_layer
|
294 |
+
|
295 |
+
# https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
|
296 |
+
self.normfeat = Feature1DProcessor(dim=64)
|
297 |
+
|
298 |
+
self.sample_rate = 48000
|
299 |
+
self.num_samples_perseg = self.sample_rate * 20 // 1000
|
300 |
+
self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
|
301 |
+
self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
|
302 |
+
# self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
303 |
+
# self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
304 |
+
self.bestrq = load_model(
|
305 |
+
model_dir='path/to/our-MERT/mert_fairseq',
|
306 |
+
checkpoint_dir='checkpoint-120000.pt',
|
307 |
+
)
|
308 |
+
self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
|
309 |
+
self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
|
310 |
+
for v in self.bestrq.parameters():v.requires_grad = False
|
311 |
+
self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 2, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
312 |
+
# for v in self.rvq_bestrq_emb.parameters():
|
313 |
+
# print(v)
|
314 |
+
freeze_parameters='quantizers.0'
|
315 |
+
for name, param in self.rvq_bestrq_emb.named_parameters():
|
316 |
+
if freeze_parameters in name:
|
317 |
+
param.requires_grad = False
|
318 |
+
print("Freezing RVQ parameters:", name)
|
319 |
+
self.hubert = HubertModelWithFinalProj.from_pretrained("huggingface_cache/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
|
320 |
+
for v in self.hubert.parameters():v.requires_grad = False
|
321 |
+
self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
|
322 |
+
# self.xvecmodel = XVECModel()
|
323 |
+
config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
|
324 |
+
unet = GPT2Model(config)
|
325 |
+
mlp = nn.Sequential(
|
326 |
+
nn.Linear(1200, 1024),
|
327 |
+
nn.SiLU(),
|
328 |
+
nn.Linear(1024, 1024),
|
329 |
+
nn.SiLU(),
|
330 |
+
nn.Linear(1024, 768)
|
331 |
+
)
|
332 |
+
self.set_from = "random"
|
333 |
+
self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
|
334 |
+
self.mask_emb = torch.nn.Embedding(3, 48)
|
335 |
+
print("Transformer initialized from pretrain.")
|
336 |
+
torch.cuda.empty_cache()
|
337 |
+
# self.unet.set_attn_processor(AttnProcessor2_0())
|
338 |
+
# self.unet.set_use_memory_efficient_attention_xformers(True)
|
339 |
+
|
340 |
+
# self.start_embedding = nn.Parameter(torch.randn(1,1024))
|
341 |
+
# self.end_embedding = nn.Parameter(torch.randn(1,1024))
|
342 |
+
|
343 |
+
def compute_snr(self, timesteps):
|
344 |
+
"""
|
345 |
+
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
|
346 |
+
"""
|
347 |
+
alphas_cumprod = self.noise_scheduler.alphas_cumprod
|
348 |
+
sqrt_alphas_cumprod = alphas_cumprod**0.5
|
349 |
+
sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
|
350 |
+
|
351 |
+
# Expand the tensors.
|
352 |
+
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
|
353 |
+
sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
354 |
+
while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
|
355 |
+
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
|
356 |
+
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
|
357 |
+
|
358 |
+
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
359 |
+
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
|
360 |
+
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
|
361 |
+
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
|
362 |
+
|
363 |
+
# Compute SNR.
|
364 |
+
snr = (alpha / sigma) ** 2
|
365 |
+
return snr
|
366 |
+
|
367 |
+
def preprocess_audio(self, input_audios, threshold=0.9):
|
368 |
+
assert len(input_audios.shape) == 2, input_audios.shape
|
369 |
+
norm_value = torch.ones_like(input_audios[:,0])
|
370 |
+
max_volume = input_audios.abs().max(dim=-1)[0]
|
371 |
+
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
372 |
+
return input_audios/norm_value.unsqueeze(-1)
|
373 |
+
|
374 |
+
def extract_wav2vec_embeds(self, input_audios,output_len):
|
375 |
+
wav2vec_stride = 2
|
376 |
+
|
377 |
+
wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
|
378 |
+
# print(wav2vec_embeds)
|
379 |
+
# print("audio.shape:",input_audios.shape)
|
380 |
+
wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
|
381 |
+
# print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
|
382 |
+
wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
|
383 |
+
return wav2vec_embeds_last
|
384 |
+
|
385 |
+
def extract_mert_embeds(self, input_audios):
|
386 |
+
prompt_stride = 3
|
387 |
+
inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
|
388 |
+
input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
|
389 |
+
prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
|
390 |
+
mert_emb= prompt_embeds[-1]
|
391 |
+
mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)
|
392 |
+
|
393 |
+
return mert_emb
|
394 |
+
|
395 |
+
def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
|
396 |
+
self.bestrq.eval()
|
397 |
+
# print("audio shape:",input_audio_0.shape)
|
398 |
+
input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
|
399 |
+
# print("input_wav_mean.shape:",input_wav_mean.shape)
|
400 |
+
# input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
|
401 |
+
input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
|
402 |
+
layer_results = input_wav_mean['layer_results']
|
403 |
+
# print("layer_results.shape:",layer_results[layer].shape)
|
404 |
+
bestrq_emb = layer_results[layer]
|
405 |
+
bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
|
406 |
+
#[b,t,1024] t=t/960
|
407 |
+
#35.84s->batch,896,1024
|
408 |
+
return bestrq_emb
|
409 |
+
|
410 |
+
|
411 |
+
def extract_spk_embeds(self, input_audios):
|
412 |
+
spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
|
413 |
+
spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
|
414 |
+
return spk_embeds
|
415 |
+
|
416 |
+
def extract_lyric_feats(self, lyric):
|
417 |
+
with torch.no_grad():
|
418 |
+
try:
|
419 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
|
420 |
+
except:
|
421 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
|
422 |
+
text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
|
423 |
+
text_mask = text_mask.to(self.device)
|
424 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = \
|
425 |
+
pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
|
426 |
+
text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
|
427 |
+
return text_encoder_hidden_states, text_mask
|
428 |
+
|
429 |
+
def extract_energy_bar(self, input_audios):
|
430 |
+
if(input_audios.shape[-1] % self.num_samples_perseg > 0):
|
431 |
+
energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
432 |
+
else:
|
433 |
+
energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
434 |
+
energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
|
435 |
+
energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
|
436 |
+
energy_embedding = self.energy_embedding(energy_bar)
|
437 |
+
energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
|
438 |
+
return energy_embedding
|
439 |
+
|
440 |
+
def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
|
441 |
+
additional_feats = ['spk', 'lyric'], \
|
442 |
+
train_rvq=True, train_ssl=False,layer=5):
|
443 |
+
if not hasattr(self,"device"):
|
444 |
+
self.device = input_audios.device
|
445 |
+
if not hasattr(self,"dtype"):
|
446 |
+
self.dtype = input_audios.dtype
|
447 |
+
device = self.device
|
448 |
+
input_audio_0 = input_audios[:,0,:]
|
449 |
+
input_audio_1 = input_audios[:,1,:]
|
450 |
+
input_audio_0 = self.preprocess_audio(input_audio_0)
|
451 |
+
input_audio_1 = self.preprocess_audio(input_audio_1)
|
452 |
+
input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
|
453 |
+
# energy_embedding = self.extract_energy_bar(input_audios)
|
454 |
+
# print("energy_embedding.shape:",energy_embedding.shape)
|
455 |
+
# with autocast(enabled=False):
|
456 |
+
if(train_ssl):
|
457 |
+
self.wav2vec.train()
|
458 |
+
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
|
459 |
+
self.clap_embd_extractor.train()
|
460 |
+
prompt_embeds = self.extract_mert_embeds(input_audios)
|
461 |
+
if('spk' in additional_feats):
|
462 |
+
self.xvecmodel.train()
|
463 |
+
spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
|
464 |
+
else:
|
465 |
+
with torch.no_grad():
|
466 |
+
with autocast(enabled=False):
|
467 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
468 |
+
# mert_emb = self.extract_mert_embeds(input_audios_mert)
|
469 |
+
|
470 |
+
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])
|
471 |
+
|
472 |
+
bestrq_emb = bestrq_emb.detach()
|
473 |
+
if('lyric' in additional_feats):
|
474 |
+
text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
|
475 |
+
else:
|
476 |
+
text_encoder_hidden_states, text_mask = None, None
|
477 |
+
|
478 |
+
|
479 |
+
if(train_rvq):
|
480 |
+
random_num=random.random()
|
481 |
+
if(random_num<0.6):
|
482 |
+
rvq_layer = 1
|
483 |
+
elif(random_num<0.8):
|
484 |
+
rvq_layer = 2
|
485 |
+
else:
|
486 |
+
rvq_layer = 4
|
487 |
+
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb,n_quantizers=rvq_layer) # b,d,t
|
488 |
+
else:
|
489 |
+
bestrq_emb = bestrq_emb.float()
|
490 |
+
self.rvq_bestrq_emb.eval()
|
491 |
+
# with autocast(enabled=False):
|
492 |
+
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
493 |
+
commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
|
494 |
+
codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
|
495 |
+
quantized_bestrq_emb = quantized_bestrq_emb.detach()
|
496 |
+
|
497 |
+
commitment_loss = commitment_loss_bestrq_emb
|
498 |
+
codebook_loss = codebook_loss_bestrq_emb
|
499 |
+
|
500 |
+
|
501 |
+
alpha=1
|
502 |
+
quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
|
503 |
+
|
504 |
+
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
505 |
+
# print("latent_masks.shape:",latent_masks.shape)
|
506 |
+
# quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
|
507 |
+
|
508 |
+
|
509 |
+
|
510 |
+
scenario = np.random.choice(['start_seg', 'other_seg'])
|
511 |
+
if(scenario == 'other_seg'):
|
512 |
+
for binx in range(input_audios.shape[0]):
|
513 |
+
# latent_masks[binx,0:64] = 1
|
514 |
+
latent_masks[binx,0:random.randint(64,128)] = 1
|
515 |
+
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
516 |
+
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
517 |
+
# print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
|
518 |
+
# print("latent_masks.shape:",latent_masks.shape)
|
519 |
+
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
520 |
+
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
521 |
+
|
522 |
+
|
523 |
+
|
524 |
+
|
525 |
+
if self.uncondition:
|
526 |
+
mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
|
527 |
+
if len(mask_indices) > 0:
|
528 |
+
quantized_bestrq_emb[mask_indices] = 0
|
529 |
+
# print("latents.shape:",latents.shape)
|
530 |
+
latents = latents.permute(0,2,1).contiguous()
|
531 |
+
latents = self.normfeat.project_sample(latents)
|
532 |
+
latents = latents.permute(0,2,1).contiguous()
|
533 |
+
incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
534 |
+
attention_mask=(latent_masks > 0.5)
|
535 |
+
B, L = attention_mask.size()
|
536 |
+
attention_mask = attention_mask.view(B, 1, L)
|
537 |
+
attention_mask = attention_mask * attention_mask.transpose(-1, -2)
|
538 |
+
attention_mask = attention_mask.unsqueeze(1)
|
539 |
+
# print("incontext_latents.shape:",incontext_latents.shape)
|
540 |
+
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
541 |
+
latent_mask_input = self.mask_emb(latent_masks)
|
542 |
+
#64+48+64+1024
|
543 |
+
loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
|
544 |
+
return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
|
545 |
+
|
546 |
+
def init_device_dtype(self, device, dtype):
|
547 |
+
self.device = device
|
548 |
+
self.dtype = dtype
|
549 |
+
|
550 |
+
@torch.no_grad()
|
551 |
+
def fetch_codes(self, input_audios, additional_feats,layer,rvq_num=1):
|
552 |
+
input_audio_0 = input_audios[[0],:]
|
553 |
+
input_audio_1 = input_audios[[1],:]
|
554 |
+
input_audio_0 = self.preprocess_audio(input_audio_0)
|
555 |
+
input_audio_1 = self.preprocess_audio(input_audio_1)
|
556 |
+
|
557 |
+
self.bestrq.eval()
|
558 |
+
        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
        # bestrq_middle = bestrq_middle.detach()
        # bestrq_last = bestrq_last.detach()
        bestrq_emb = self.extract_bestrq_embeds(input_audio_0, input_audio_1, layer)
        bestrq_emb = bestrq_emb.detach()

        # self.rvq_bestrq_middle.eval()
        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
        # self.rvq_bestrq_last.eval()
        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t

        self.rvq_bestrq_emb.eval()
        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
        codes_bestrq_emb = codes_bestrq_emb[:, :rvq_num, :]
        # print("codes_bestrq_emb.shape:", codes_bestrq_emb.shape)
        # exit()

        if('spk' in additional_feats):
            self.xvecmodel.eval()
            spk_embeds = self.extract_spk_embeds(input_audios)
        else:
            spk_embeds = None

        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
        # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds

    @torch.no_grad()
    def fetch_codes_batch(self, input_audios, additional_feats, layer, rvq_num=1):
        input_audio_0 = input_audios[:, 0, :]
        input_audio_1 = input_audios[:, 1, :]
        input_audio_0 = self.preprocess_audio(input_audio_0)
        input_audio_1 = self.preprocess_audio(input_audio_1)

        self.bestrq.eval()

        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
        # bestrq_middle = bestrq_middle.detach()
        # bestrq_last = bestrq_last.detach()
        bestrq_emb = self.extract_bestrq_embeds(input_audio_0, input_audio_1, layer)
        bestrq_emb = bestrq_emb.detach()

        # self.rvq_bestrq_middle.eval()
        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
        # self.rvq_bestrq_last.eval()
        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t

        self.rvq_bestrq_emb.eval()
        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
        # print("codes_bestrq_emb.shape:", codes_bestrq_emb.shape)
        codes_bestrq_emb = codes_bestrq_emb[:, :rvq_num, :]
        # print("codes_bestrq_emb.shape:", codes_bestrq_emb.shape)
        # exit()

        if('spk' in additional_feats):
            self.xvecmodel.eval()
            spk_embeds = self.extract_spk_embeds(input_audios)
        else:
            spk_embeds = None

        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
        # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds

    @torch.no_grad()
    def fetch_codes_batch_ds(self, input_audios, additional_feats, layer, rvq_num=1, ds=250):
        input_audio_0 = input_audios[:, 0, :]
        input_audio_1 = input_audios[:, 1, :]
        input_audio_0 = self.preprocess_audio(input_audio_0)
        input_audio_1 = self.preprocess_audio(input_audio_1)

        self.bestrq.eval()

        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
        # bestrq_middle = bestrq_middle.detach()
        # bestrq_last = bestrq_last.detach()
        bestrq_emb = self.extract_bestrq_embeds(input_audio_0, input_audio_1, layer)
        bestrq_emb = bestrq_emb.detach()

        # self.rvq_bestrq_middle.eval()
        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
        # self.rvq_bestrq_last.eval()
        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t

        self.rvq_bestrq_emb.eval()
        bestrq_emb = torch.nn.functional.avg_pool1d(bestrq_emb, kernel_size=ds, stride=ds)
        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
        # print("codes_bestrq_emb.shape:", codes_bestrq_emb.shape)
        codes_bestrq_emb = codes_bestrq_emb[:, :rvq_num, :]
        # print("codes_bestrq_emb.shape:", codes_bestrq_emb.shape)
        # exit()

        if('spk' in additional_feats):
            self.xvecmodel.eval()
            spk_embeds = self.extract_spk_embeds(input_audios)
        else:
            spk_embeds = None

        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
        # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds

    @torch.no_grad()
    def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
                        guidance_scale=2, num_steps=20,
                        disable_progress=True, scenario='start_seg'):
        classifier_free_guidance = guidance_scale > 1.0
        device = self.device
        dtype = self.dtype
        # codes_bestrq_middle, codes_bestrq_last = codes
        codes_bestrq_emb = codes[0]

        batch_size = codes_bestrq_emb.shape[0]

        quantized_bestrq_emb, _, _ = self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
        quantized_bestrq_emb = quantized_bestrq_emb.permute(0, 2, 1).contiguous()
        print("quantized_bestrq_emb.shape:", quantized_bestrq_emb.shape)
        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)

        if('spk' in additional_feats):
            spk_embeds = spk_embeds.repeat(1, 1, quantized_bestrq_emb.shape[-2], 1).detach()

        num_frames = quantized_bestrq_emb.shape[1]

        num_channels_latents = self.num_channels
        shape = (batch_size, num_frames, 64)
        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)

        latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
        latent_masks[:, 0:latent_length] = 2
        if(scenario == 'other_seg'):
            latent_masks[:, 0:incontext_length] = 1

        quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
            + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1, 1, 1024)
        true_latents = true_latents.permute(0, 2, 1).contiguous()
        true_latents = self.normfeat.project_sample(true_latents)
        true_latents = true_latents.permute(0, 2, 1).contiguous()
        incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
        incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]

        attention_mask = (latent_masks > 0.5)
        B, L = attention_mask.size()
        attention_mask = attention_mask.view(B, 1, L)
        attention_mask = attention_mask * attention_mask.transpose(-1, -2)
        attention_mask = attention_mask.unsqueeze(1)
        latent_mask_input = self.mask_emb(latent_masks)

        if('spk' in additional_feats):
            # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
            additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds], 1)
        else:
            # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
            additional_model_input = torch.cat([quantized_bestrq_emb], 1)

        temperature = 1.0
        t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
        latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input, incontext_latents, incontext_length, t_span, additional_model_input, attention_mask, guidance_scale)

        latents[:, 0:incontext_length, :] = incontext_latents[:, 0:incontext_length, :]
        latents = latents.permute(0, 2, 1).contiguous()
        latents = self.normfeat.return_sample(latents)
        # latents = latents.permute(0,2,1).contiguous()
        return latents

    @torch.no_grad()
    def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
                  disable_progress=True, layer=5, scenario='start_seg', rvq_num=1):
        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats, layer, rvq_num)

        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
            guidance_scale=guidance_scale, num_steps=num_steps, \
            disable_progress=disable_progress, scenario=scenario)
        return latents

    @torch.no_grad()
    def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
                      disable_progress=True, layer=5, scenario='start_seg'):
        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats, layer)
        import time
        start = time.time()
        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
            guidance_scale=guidance_scale, num_steps=num_steps, \
            disable_progress=disable_progress, scenario=scenario)
        return latents, time.time() - start

    def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
        divisor = 4
        shape = (batch_size, num_channels_latents, num_frames, 32)
        if(num_frames % divisor > 0):
            num_frames = round(num_frames / float(divisor)) * divisor
            shape = (batch_size, num_channels_latents, num_frames, 32)
        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
        return latents
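For orientation, the sketch below shows how the two-stage interface above (fetch_codes_batch to extract RVQ codes, then inference_codes to decode latents with the CFM Euler solver) can be driven end to end. It is an illustrative sketch only: the helper name encode_then_decode, the variable names, and the empty additional_feats list are assumptions made for the example and are not part of the repository; `model` is assumed to be an already-initialized PromptCondAudioDiffusion with its device and dtype set.

import torch

@torch.no_grad()
def encode_then_decode(model, stereo_48k, true_latents, latent_length,
                       layer=5, rvq_num=1, guidance_scale=2.0, num_steps=20):
    # Stage 1: BEST-RQ features -> RVQ codes; fetch_codes_batch expects stereo audio [B, 2, T] at 48 kHz.
    codes, embeds, spk_embeds = model.fetch_codes_batch(stereo_48k, [], layer, rvq_num=rvq_num)
    # Stage 2: codes -> normalized latents via inference_codes (classifier-free guidance when scale > 1).
    latents = model.inference_codes(codes, spk_embeds, true_latents, latent_length, [],
                                    guidance_scale=guidance_scale, num_steps=num_steps,
                                    scenario='start_seg')
    return codes[0], latents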
codeclm/tokenizer/Flow1dVAE/model_4rvq.py
CHANGED
@@ -1,774 +1,774 @@
|
1 |
+
import yaml
|
2 |
+
import random
|
3 |
+
import inspect
|
4 |
+
import numpy as np
|
5 |
+
from tqdm import tqdm
|
6 |
+
import typing as tp
|
7 |
+
from abc import ABC
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import torchaudio
|
13 |
+
|
14 |
+
from einops import repeat
|
15 |
+
from tools.torch_tools import wav_to_fbank
|
16 |
+
|
17 |
+
import diffusers
|
18 |
+
from diffusers.utils.torch_utils import randn_tensor
|
19 |
+
from diffusers import DDPMScheduler
|
20 |
+
from models.transformer_2d_flow import Transformer2DModel
|
21 |
+
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel,HubertModel
|
22 |
+
# from tools.get_mulan import get_mulan
|
23 |
+
from third_party.wespeaker.extract_embd import XVECModel
|
24 |
+
# from libs.rvq2 import RVQEmbedding
|
25 |
+
from libs.rvq.descript_quantize3_4layer_freezelayer1 import ResidualVectorQuantize
|
26 |
+
|
27 |
+
from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
|
28 |
+
from models_gpt.models.gpt2_config import GPT2Config
|
29 |
+
|
30 |
+
from torch.cuda.amp import autocast
|
31 |
+
|
32 |
+
|
33 |
+
from our_MERT_BESTRQ.test import load_model
|
34 |
+
|
35 |
+
class HubertModelWithFinalProj(HubertModel):
|
36 |
+
def __init__(self, config):
|
37 |
+
super().__init__(config)
|
38 |
+
|
39 |
+
# The final projection layer is only used for backward compatibility.
|
40 |
+
# Following https://github.com/auspicious3000/contentvec/issues/6
|
41 |
+
# Remove this layer is necessary to achieve the desired outcome.
|
42 |
+
print("hidden_size:",config.hidden_size)
|
43 |
+
print("classifier_proj_size:",config.classifier_proj_size)
|
44 |
+
self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
|
45 |
+
|
46 |
+
|
47 |
+
class SampleProcessor(torch.nn.Module):
|
48 |
+
def project_sample(self, x: torch.Tensor):
|
49 |
+
"""Project the original sample to the 'space' where the diffusion will happen."""
|
50 |
+
"""Project back from diffusion space to the actual sample space."""
|
51 |
+
return z
|
52 |
+
|
53 |
+
class Feature1DProcessor(SampleProcessor):
|
54 |
+
def __init__(self, dim: int = 100, power_std = 1., \
|
55 |
+
num_samples: int = 100_000, cal_num_frames: int = 600):
|
56 |
+
super().__init__()
|
57 |
+
|
58 |
+
self.num_samples = num_samples
|
59 |
+
self.dim = dim
|
60 |
+
self.power_std = power_std
|
61 |
+
self.cal_num_frames = cal_num_frames
|
62 |
+
self.register_buffer('counts', torch.zeros(1))
|
63 |
+
self.register_buffer('sum_x', torch.zeros(dim))
|
64 |
+
self.register_buffer('sum_x2', torch.zeros(dim))
|
65 |
+
self.register_buffer('sum_target_x2', torch.zeros(dim))
|
66 |
+
self.counts: torch.Tensor
|
67 |
+
self.sum_x: torch.Tensor
|
68 |
+
self.sum_x2: torch.Tensor
|
69 |
+
|
70 |
+
@property
|
71 |
+
def mean(self):
|
72 |
+
mean = self.sum_x / self.counts
|
73 |
+
if(self.counts < 10):
|
74 |
+
mean = torch.zeros_like(mean)
|
75 |
+
return mean
|
76 |
+
|
77 |
+
@property
|
78 |
+
def std(self):
|
79 |
+
std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
|
80 |
+
if(self.counts < 10):
|
81 |
+
std = torch.ones_like(std)
|
82 |
+
return std
|
83 |
+
|
84 |
+
@property
|
85 |
+
def target_std(self):
|
86 |
+
return 1
|
87 |
+
|
88 |
+
def project_sample(self, x: torch.Tensor):
|
89 |
+
assert x.dim() == 3
|
90 |
+
if self.counts.item() < self.num_samples:
|
91 |
+
self.counts += len(x)
|
92 |
+
self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
|
93 |
+
self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
|
94 |
+
rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
|
95 |
+
x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
|
96 |
+
return x
|
97 |
+
|
98 |
+
def return_sample(self, x: torch.Tensor):
|
99 |
+
assert x.dim() == 3
|
100 |
+
rescale = (self.std / self.target_std) ** self.power_std
|
101 |
+
# print(rescale, self.mean)
|
102 |
+
x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
|
103 |
+
return x
|
104 |
+
|
105 |
+
def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
|
106 |
+
if(prior_text_encoder_hidden_states.shape[1]<len_size):
|
107 |
+
prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
|
108 |
+
torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
|
109 |
+
prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
|
110 |
+
dtype=prior_text_encoder_hidden_states.dtype)],1)
|
111 |
+
prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
|
112 |
+
else:
|
113 |
+
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
|
114 |
+
prior_text_mask = prior_text_mask[:,0:len_size]
|
115 |
+
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
|
116 |
+
return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
|
117 |
+
|
118 |
+
class BASECFM(torch.nn.Module, ABC):
|
119 |
+
def __init__(
|
120 |
+
self,
|
121 |
+
estimator,
|
122 |
+
mlp,
|
123 |
+
ssl_layer
|
124 |
+
):
|
125 |
+
super().__init__()
|
126 |
+
self.sigma_min = 1e-4
|
127 |
+
|
128 |
+
self.estimator = estimator
|
129 |
+
self.mlp = mlp
|
130 |
+
self.ssl_layer = ssl_layer
|
131 |
+
|
132 |
+
@torch.inference_mode()
|
133 |
+
def forward(self, mu, n_timesteps, temperature=1.0):
|
134 |
+
"""Forward diffusion
|
135 |
+
|
136 |
+
Args:
|
137 |
+
mu (torch.Tensor): output of encoder
|
138 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
139 |
+
n_timesteps (int): number of diffusion steps
|
140 |
+
temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
|
141 |
+
|
142 |
+
Returns:
|
143 |
+
sample: generated mel-spectrogram
|
144 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
145 |
+
"""
|
146 |
+
z = torch.randn_like(mu) * temperature
|
147 |
+
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
|
148 |
+
return self.solve_euler(z, t_span=t_span)
|
149 |
+
|
150 |
+
def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
|
151 |
+
"""
|
152 |
+
Fixed euler solver for ODEs.
|
153 |
+
Args:
|
154 |
+
x (torch.Tensor): random noise
|
155 |
+
t_span (torch.Tensor): n_timesteps interpolated
|
156 |
+
shape: (n_timesteps + 1,)
|
157 |
+
mu (torch.Tensor): output of encoder
|
158 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
159 |
+
"""
|
160 |
+
t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
|
161 |
+
noise = x.clone()
|
162 |
+
|
163 |
+
# I am storing this because I can later plot it by putting a debugger here and saving it to a file
|
164 |
+
# Or in future might add like a return_all_steps flag
|
165 |
+
sol = []
|
166 |
+
|
167 |
+
for step in tqdm(range(1, len(t_span))):
|
168 |
+
print("incontext_x.shape:",incontext_x.shape)
|
169 |
+
print("noise.shape:",noise.shape)
|
170 |
+
print("t.shape:",t.shape)
|
171 |
+
x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
|
172 |
+
if(guidance_scale > 1.0):
|
173 |
+
|
174 |
+
model_input = torch.cat([ \
|
175 |
+
torch.cat([latent_mask_input, latent_mask_input], 0), \
|
176 |
+
torch.cat([incontext_x, incontext_x], 0), \
|
177 |
+
torch.cat([torch.zeros_like(mu), mu], 0), \
|
178 |
+
torch.cat([x, x], 0), \
|
179 |
+
], 2)
|
180 |
+
timestep=t.unsqueeze(-1).repeat(2)
|
181 |
+
|
182 |
+
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
183 |
+
dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
|
184 |
+
dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
|
185 |
+
else:
|
186 |
+
model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
|
187 |
+
timestep=t.unsqueeze(-1)
|
188 |
+
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
189 |
+
|
190 |
+
dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
|
191 |
+
print("dphi_dt.shape:",dphi_dt.shape)
|
192 |
+
print("x.shape:",x.shape)
|
193 |
+
|
194 |
+
x = x + dt * dphi_dt
|
195 |
+
t = t + dt
|
196 |
+
sol.append(x)
|
197 |
+
if step < len(t_span) - 1:
|
198 |
+
dt = t_span[step + 1] - t
|
199 |
+
|
200 |
+
return sol[-1]
|
201 |
+
|
202 |
+
def projection_loss(self,hidden_proj, bestrq_emb):
|
203 |
+
bsz = hidden_proj.shape[0]
|
204 |
+
|
205 |
+
hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
|
206 |
+
bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
|
207 |
+
|
208 |
+
proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
|
209 |
+
proj_loss = 1+proj_loss.mean()
|
210 |
+
|
211 |
+
return proj_loss
|
212 |
+
|
213 |
+
def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
|
214 |
+
"""Computes diffusion loss
|
215 |
+
|
216 |
+
Args:
|
217 |
+
x1 (torch.Tensor): Target
|
218 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
219 |
+
mu (torch.Tensor): output of encoder
|
220 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
221 |
+
|
222 |
+
Returns:
|
223 |
+
loss: conditional flow matching loss
|
224 |
+
y: conditional flow
|
225 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
226 |
+
"""
|
227 |
+
b = mu[0].shape[0]
|
228 |
+
len_x = x1.shape[2]
|
229 |
+
# random timestep
|
230 |
+
if(validation_mode):
|
231 |
+
t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
|
232 |
+
else:
|
233 |
+
t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
|
234 |
+
# sample noise p(x_0)
|
235 |
+
z = torch.randn_like(x1)
|
236 |
+
|
237 |
+
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
238 |
+
u = x1 - (1 - self.sigma_min) * z
|
239 |
+
# print("y.shape:",y.shape)
|
240 |
+
#self.unet(inputs_embeds=model_input, attention_mask=attention_mask,encoder_hidden_states=text_embedding,encoder_attention_mask=txt_attn_mask,time_step=timesteps).last_hidden_state
|
241 |
+
model_input = torch.cat([*mu,y], 2)
|
242 |
+
t=t.squeeze(-1).squeeze(-1)
|
243 |
+
# print("model_input.shape:",model_input.shape)
|
244 |
+
# print("attention_mask.shape:",attention_mask.shape)
|
245 |
+
out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
|
246 |
+
hidden_layer = out.hidden_states[self.ssl_layer]
|
247 |
+
hidden_proj = self.mlp(hidden_layer)
|
248 |
+
# print("hidden_proj.shape:",hidden_proj.shape)
|
249 |
+
# print("mert_emb.shape:",mert_emb.shape)
|
250 |
+
# exit()
|
251 |
+
|
252 |
+
|
253 |
+
out = out.last_hidden_state
|
254 |
+
|
255 |
+
out=out[:,:,-len_x:]
|
256 |
+
# out=self.proj_out(out)
|
257 |
+
|
258 |
+
weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
|
259 |
+
# print("out.shape",out.shape)
|
260 |
+
# print("u.shape",u.shape)
|
261 |
+
loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
|
262 |
+
# print("hidden_proj.shape:",hidden_proj.shape)
|
263 |
+
# print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
|
264 |
+
loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
|
265 |
+
loss = loss_re + loss_cos * 0.5
|
266 |
+
# print("loss_cos:",loss_cos,loss_cos.device)
|
267 |
+
print("loss:",loss,loss.device)
|
268 |
+
# exit()
|
269 |
+
return loss, loss_re, loss_cos
|
270 |
+
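`compute_loss` above builds the standard conditional flow matching pair: an interpolant `y` between noise and the target, and the constant velocity `u` that the estimator is trained to predict on the generated frames. A stripped-down sketch of just that construction (the shapes and the zero "prediction" are placeholders, not the model's actual output):

```python
import torch
import torch.nn.functional as F

sigma_min = 1e-4
x1 = torch.randn(2, 64, 250)     # target latents, hypothetical shape
z = torch.randn_like(x1)         # noise sample x_0
t = torch.rand(2, 1, 1)          # one random timestep per example

y = (1 - (1 - sigma_min) * t) * z + t * x1   # point on the noise-to-data path
u = x1 - (1 - sigma_min) * z                 # velocity target along that path

pred = torch.zeros_like(u)                   # stand-in for the estimator output
loss = F.mse_loss(pred, u)
```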
|
271 |
+
class PromptCondAudioDiffusion(nn.Module):
|
272 |
+
def __init__(
|
273 |
+
self,
|
274 |
+
num_channels,
|
275 |
+
unet_model_name=None,
|
276 |
+
unet_model_config_path=None,
|
277 |
+
snr_gamma=None,
|
278 |
+
hubert_layer=None,
|
279 |
+
ssl_layer=None,
|
280 |
+
uncondition=True,
|
281 |
+
out_paint=False,
|
282 |
+
):
|
283 |
+
super().__init__()
|
284 |
+
|
285 |
+
assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
|
286 |
+
|
287 |
+
self.unet_model_name = unet_model_name
|
288 |
+
self.unet_model_config_path = unet_model_config_path
|
289 |
+
self.snr_gamma = snr_gamma
|
290 |
+
self.uncondition = uncondition
|
291 |
+
self.num_channels = num_channels
|
292 |
+
self.hubert_layer = hubert_layer
|
293 |
+
self.ssl_layer = ssl_layer
|
294 |
+
|
295 |
+
# https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
|
296 |
+
self.normfeat = Feature1DProcessor(dim=64)
|
297 |
+
|
298 |
+
self.sample_rate = 48000
|
299 |
+
self.num_samples_perseg = self.sample_rate * 20 // 1000
|
300 |
+
self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
|
301 |
+
self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
|
302 |
+
# self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
303 |
+
# self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
304 |
+
self.bestrq = load_model(
|
305 |
+
model_dir='path/to/our-MERT/mert_fairseq',
|
306 |
+
checkpoint_dir='checkpoint-120000.pt',
|
307 |
+
)
|
308 |
+
self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
|
309 |
+
self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
|
310 |
+
for v in self.bestrq.parameters():v.requires_grad = False
|
311 |
+
self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 4, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
312 |
+
# for v in self.rvq_bestrq_emb.parameters():
|
313 |
+
# print(v)
|
314 |
+
freeze_parameters='quantizers.0'
|
315 |
+
for name, param in self.rvq_bestrq_emb.named_parameters():
|
316 |
+
if freeze_parameters in name:
|
317 |
+
param.requires_grad = False
|
318 |
+
print("Freezing RVQ parameters:", name)
|
319 |
+
self.hubert = HubertModelWithFinalProj.from_pretrained("huggingface_cache/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
|
320 |
+
for v in self.hubert.parameters():v.requires_grad = False
|
321 |
+
self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
|
322 |
+
# self.xvecmodel = XVECModel()
|
323 |
+
config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
|
324 |
+
unet = GPT2Model(config)
|
325 |
+
mlp = nn.Sequential(
|
326 |
+
nn.Linear(1200, 1024),
|
327 |
+
nn.SiLU(),
|
328 |
+
nn.Linear(1024, 1024),
|
329 |
+
nn.SiLU(),
|
330 |
+
nn.Linear(1024, 768)
|
331 |
+
)
|
332 |
+
self.set_from = "random"
|
333 |
+
self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
|
334 |
+
self.mask_emb = torch.nn.Embedding(3, 48)
|
335 |
+
print("Transformer initialized from pretrain.")
|
336 |
+
torch.cuda.empty_cache()
|
337 |
+
# self.unet.set_attn_processor(AttnProcessor2_0())
|
338 |
+
# self.unet.set_use_memory_efficient_attention_xformers(True)
|
339 |
+
|
340 |
+
# self.start_embedding = nn.Parameter(torch.randn(1,1024))
|
341 |
+
# self.end_embedding = nn.Parameter(torch.randn(1,1024))
|
342 |
+
|
343 |
+
def compute_snr(self, timesteps):
|
344 |
+
"""
|
345 |
+
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
|
346 |
+
"""
|
347 |
+
alphas_cumprod = self.noise_scheduler.alphas_cumprod
|
348 |
+
sqrt_alphas_cumprod = alphas_cumprod**0.5
|
349 |
+
sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
|
350 |
+
|
351 |
+
# Expand the tensors.
|
352 |
+
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
|
353 |
+
sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
354 |
+
while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
|
355 |
+
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
|
356 |
+
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
|
357 |
+
|
358 |
+
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
359 |
+
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
|
360 |
+
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
|
361 |
+
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
|
362 |
+
|
363 |
+
# Compute SNR.
|
364 |
+
snr = (alpha / sigma) ** 2
|
365 |
+
return snr
|
366 |
+
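`compute_snr` above converts a scheduler's cumulative alphas into a per-timestep signal-to-noise ratio, SNR(t) = (alpha_t / sigma_t)^2. A small numeric sketch with a made-up schedule (the `noise_scheduler` it references is not defined in this file, so the values here are purely illustrative):

```python
import torch

alphas_cumprod = torch.linspace(0.9999, 0.01, 1000)   # hypothetical schedule
timesteps = torch.tensor([10, 500, 990])

alpha = alphas_cumprod[timesteps] ** 0.5
sigma = (1.0 - alphas_cumprod[timesteps]) ** 0.5
snr = (alpha / sigma) ** 2
print(snr)   # SNR shrinks as t grows, i.e. as samples get noisier
```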
|
367 |
+
def preprocess_audio(self, input_audios, threshold=0.9):
|
368 |
+
assert len(input_audios.shape) == 2, input_audios.shape
|
369 |
+
norm_value = torch.ones_like(input_audios[:,0])
|
370 |
+
max_volume = input_audios.abs().max(dim=-1)[0]
|
371 |
+
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
372 |
+
return input_audios/norm_value.unsqueeze(-1)
|
373 |
+
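`preprocess_audio` above only rescales waveforms whose peak exceeds the threshold, leaving quieter audio untouched. An equivalent standalone sketch (function name is illustrative):

```python
import torch

def peak_limit(wav, threshold=0.9):
    peak = wav.abs().max(dim=-1).values                     # (batch,)
    scale = torch.where(peak > threshold, peak / threshold, torch.ones_like(peak))
    return wav / scale.unsqueeze(-1)

wav = torch.randn(2, 48000).clamp(-1.5, 1.5)
print(peak_limit(wav).abs().max(dim=-1).values)             # peaks now at most ~0.9
```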
|
374 |
+
def extract_wav2vec_embeds(self, input_audios,output_len):
|
375 |
+
wav2vec_stride = 2
|
376 |
+
|
377 |
+
wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
|
378 |
+
# print(wav2vec_embeds)
|
379 |
+
# print("audio.shape:",input_audios.shape)
|
380 |
+
wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
|
381 |
+
# print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
|
382 |
+
wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
|
383 |
+
return wav2vec_embeds_last
|
384 |
+
|
385 |
+
def extract_mert_embeds(self, input_audios):
|
386 |
+
prompt_stride = 3
|
387 |
+
inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
|
388 |
+
input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
|
389 |
+
prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
|
390 |
+
mert_emb= prompt_embeds[-1]
|
391 |
+
mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)
|
392 |
+
|
393 |
+
return mert_emb
|
394 |
+
|
395 |
+
def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
|
396 |
+
self.bestrq.eval()
|
397 |
+
# print("audio shape:",input_audio_0.shape)
|
398 |
+
input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
|
399 |
+
# print("input_wav_mean.shape:",input_wav_mean.shape)
|
400 |
+
# input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
|
401 |
+
input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
|
402 |
+
layer_results = input_wav_mean['layer_results']
|
403 |
+
# print("layer_results.shape:",layer_results[layer].shape)
|
404 |
+
bestrq_emb = layer_results[layer]
|
405 |
+
bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
|
406 |
+
#[b,t,1024] t=t/960
|
407 |
+
#35.84s->batch,896,1024
|
408 |
+
return bestrq_emb
|
409 |
+
|
410 |
+
|
411 |
+
def extract_spk_embeds(self, input_audios):
|
412 |
+
spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
|
413 |
+
spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
|
414 |
+
return spk_embeds
|
415 |
+
|
416 |
+
def extract_lyric_feats(self, lyric):
|
417 |
+
with torch.no_grad():
|
418 |
+
try:
|
419 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
|
420 |
+
except:
|
421 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
|
422 |
+
text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
|
423 |
+
text_mask = text_mask.to(self.device)
|
424 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = \
|
425 |
+
pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
|
426 |
+
text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
|
427 |
+
return text_encoder_hidden_states, text_mask
|
428 |
+
|
429 |
+
def extract_energy_bar(self, input_audios):
|
430 |
+
if(input_audios.shape[-1] % self.num_samples_perseg > 0):
|
431 |
+
energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
432 |
+
else:
|
433 |
+
energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
434 |
+
energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
|
435 |
+
energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
|
436 |
+
energy_embedding = self.energy_embedding(energy_bar)
|
437 |
+
energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
|
438 |
+
return energy_embedding
|
439 |
+
|
440 |
+
def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
|
441 |
+
additional_feats = ['spk', 'lyric'], \
|
442 |
+
train_rvq=True, train_ssl=False,layer=5):
|
443 |
+
if not hasattr(self,"device"):
|
444 |
+
self.device = input_audios.device
|
445 |
+
if not hasattr(self,"dtype"):
|
446 |
+
self.dtype = input_audios.dtype
|
447 |
+
device = self.device
|
448 |
+
input_audio_0 = input_audios[:,0,:]
|
449 |
+
input_audio_1 = input_audios[:,1,:]
|
450 |
+
input_audio_0 = self.preprocess_audio(input_audio_0)
|
451 |
+
input_audio_1 = self.preprocess_audio(input_audio_1)
|
452 |
+
input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
|
453 |
+
# energy_embedding = self.extract_energy_bar(input_audios)
|
454 |
+
# print("energy_embedding.shape:",energy_embedding.shape)
|
455 |
+
# with autocast(enabled=False):
|
456 |
+
if(train_ssl):
|
457 |
+
self.wav2vec.train()
|
458 |
+
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
|
459 |
+
self.clap_embd_extractor.train()
|
460 |
+
prompt_embeds = self.extract_mert_embeds(input_audios)
|
461 |
+
if('spk' in additional_feats):
|
462 |
+
self.xvecmodel.train()
|
463 |
+
spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
|
464 |
+
else:
|
465 |
+
with torch.no_grad():
|
466 |
+
with autocast(enabled=False):
|
467 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
468 |
+
# mert_emb = self.extract_mert_embeds(input_audios_mert)
|
469 |
+
|
470 |
+
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])
|
471 |
+
|
472 |
+
bestrq_emb = bestrq_emb.detach()
|
473 |
+
if('lyric' in additional_feats):
|
474 |
+
text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
|
475 |
+
else:
|
476 |
+
text_encoder_hidden_states, text_mask = None, None
|
477 |
+
|
478 |
+
|
479 |
+
if(train_rvq):
|
480 |
+
random_num=random.random()
|
481 |
+
if(random_num<0.6):
|
482 |
+
rvq_layer = 1
|
483 |
+
elif(random_num<0.8):
|
484 |
+
rvq_layer = 2
|
485 |
+
else:
|
486 |
+
rvq_layer = 4
|
487 |
+
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb,n_quantizers=rvq_layer) # b,d,t
|
488 |
+
else:
|
489 |
+
bestrq_emb = bestrq_emb.float()
|
490 |
+
self.rvq_bestrq_emb.eval()
|
491 |
+
# with autocast(enabled=False):
|
492 |
+
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
493 |
+
commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
|
494 |
+
codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
|
495 |
+
quantized_bestrq_emb = quantized_bestrq_emb.detach()
|
496 |
+
|
497 |
+
commitment_loss = commitment_loss_bestrq_emb
|
498 |
+
codebook_loss = codebook_loss_bestrq_emb
|
499 |
+
|
500 |
+
|
501 |
+
alpha=1
|
502 |
+
quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
|
503 |
+
|
504 |
+
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
505 |
+
# print("latent_masks.shape:",latent_masks.shape)
|
506 |
+
# quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
|
507 |
+
|
508 |
+
|
509 |
+
|
510 |
+
scenario = np.random.choice(['start_seg', 'other_seg'])
|
511 |
+
if(scenario == 'other_seg'):
|
512 |
+
for binx in range(input_audios.shape[0]):
|
513 |
+
# latent_masks[binx,0:64] = 1
|
514 |
+
latent_masks[binx,0:random.randint(64,128)] = 1
|
515 |
+
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
516 |
+
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
517 |
+
# print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
|
518 |
+
# print("latent_masks.shape:",latent_masks.shape)
|
519 |
+
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
520 |
+
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
521 |
+
|
522 |
+
|
523 |
+
|
524 |
+
|
525 |
+
if self.uncondition:
|
526 |
+
mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
|
527 |
+
if len(mask_indices) > 0:
|
528 |
+
quantized_bestrq_emb[mask_indices] = 0
|
529 |
+
# print("latents.shape:",latents.shape)
|
530 |
+
latents = latents.permute(0,2,1).contiguous()
|
531 |
+
latents = self.normfeat.project_sample(latents)
|
532 |
+
latents = latents.permute(0,2,1).contiguous()
|
533 |
+
incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
534 |
+
attention_mask=(latent_masks > 0.5)
|
535 |
+
B, L = attention_mask.size()
|
536 |
+
attention_mask = attention_mask.view(B, 1, L)
|
537 |
+
attention_mask = attention_mask * attention_mask.transpose(-1, -2)
|
538 |
+
attention_mask = attention_mask.unsqueeze(1)
|
539 |
+
# print("incontext_latents.shape:",incontext_latents.shape)
|
540 |
+
# print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
541 |
+
latent_mask_input = self.mask_emb(latent_masks)
|
542 |
+
#64+48+64+1024
|
543 |
+
loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
|
544 |
+
return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
|
545 |
+
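As far as the forward pass above suggests, `latent_masks` encodes three frame states: 0 for padding, 1 for in-context (prompt) frames that are kept fixed, and 2 for frames to be generated; both the loss weight and the pairwise attention mask are derived from it. A toy illustration of that bookkeeping, with a hypothetical 5-frame example:

```python
import torch

latent_masks = torch.tensor([[1, 1, 2, 2, 0]])   # hypothetical 5-frame example

# full weight on generated frames, a tiny weight on padding, none on the prompt
weight = (latent_masks > 1.5).float() + (latent_masks < 0.5).float() * 0.01

# pairwise attention mask over all non-padded positions, shaped (B, 1, L, L)
valid = latent_masks > 0.5
attn = (valid.unsqueeze(1) * valid.unsqueeze(2)).unsqueeze(1)

print(weight)        # approximately [[0.00, 0.00, 1.00, 1.00, 0.01]]
print(attn.int())
```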
|
546 |
+
def init_device_dtype(self, device, dtype):
|
547 |
+
self.device = device
|
548 |
+
self.dtype = dtype
|
549 |
+
|
550 |
+
@torch.no_grad()
|
551 |
+
def fetch_codes(self, input_audios, additional_feats,layer,rvq_num=1):
|
552 |
+
input_audio_0 = input_audios[[0],:]
|
553 |
+
input_audio_1 = input_audios[[1],:]
|
554 |
+
input_audio_0 = self.preprocess_audio(input_audio_0)
|
555 |
+
input_audio_1 = self.preprocess_audio(input_audio_1)
|
556 |
+
|
557 |
+
self.bestrq.eval()
|
558 |
+
|
559 |
+
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
560 |
+
# bestrq_middle = bestrq_middle.detach()
|
561 |
+
# bestrq_last = bestrq_last.detach()
|
562 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
563 |
+
bestrq_emb = bestrq_emb.detach()
|
564 |
+
|
565 |
+
# self.rvq_bestrq_middle.eval()
|
566 |
+
# quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
|
567 |
+
# self.rvq_bestrq_last.eval()
|
568 |
+
# quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
|
569 |
+
|
570 |
+
self.rvq_bestrq_emb.eval()
|
571 |
+
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
|
572 |
+
codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
|
573 |
+
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
574 |
+
# exit()
|
575 |
+
|
576 |
+
|
577 |
+
if('spk' in additional_feats):
|
578 |
+
self.xvecmodel.eval()
|
579 |
+
spk_embeds = self.extract_spk_embeds(input_audios)
|
580 |
+
else:
|
581 |
+
spk_embeds = None
|
582 |
+
|
583 |
+
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
584 |
+
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
585 |
+
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
586 |
+
return [codes_bestrq_emb], [bestrq_emb], spk_embeds
|
587 |
+
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
588 |
+
|
589 |
+
@torch.no_grad()
|
590 |
+
def fetch_codes_batch(self, input_audios, additional_feats,layer,rvq_num=1):
|
591 |
+
input_audio_0 = input_audios[:,0,:]
|
592 |
+
input_audio_1 = input_audios[:,1,:]
|
593 |
+
input_audio_0 = self.preprocess_audio(input_audio_0)
|
594 |
+
input_audio_1 = self.preprocess_audio(input_audio_1)
|
595 |
+
|
596 |
+
self.bestrq.eval()
|
597 |
+
|
598 |
+
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
599 |
+
# bestrq_middle = bestrq_middle.detach()
|
600 |
+
# bestrq_last = bestrq_last.detach()
|
601 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
602 |
+
bestrq_emb = bestrq_emb.detach()
|
603 |
+
|
604 |
+
# self.rvq_bestrq_middle.eval()
|
605 |
+
# quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
|
606 |
+
# self.rvq_bestrq_last.eval()
|
607 |
+
# quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
|
608 |
+
|
609 |
+
self.rvq_bestrq_emb.eval()
|
610 |
+
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
|
611 |
+
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
612 |
+
codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
|
613 |
+
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
614 |
+
# exit()
|
615 |
+
|
616 |
+
|
617 |
+
if('spk' in additional_feats):
|
618 |
+
self.xvecmodel.eval()
|
619 |
+
spk_embeds = self.extract_spk_embeds(input_audios)
|
620 |
+
else:
|
621 |
+
spk_embeds = None
|
622 |
+
|
623 |
+
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
624 |
+
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
625 |
+
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
626 |
+
return [codes_bestrq_emb], [bestrq_emb], spk_embeds
|
627 |
+
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
628 |
+
|
629 |
+
@torch.no_grad()
|
630 |
+
def fetch_codes_batch_ds(self, input_audios, additional_feats, layer, rvq_num=1, ds=250):
|
631 |
+
input_audio_0 = input_audios[:,0,:]
|
632 |
+
input_audio_1 = input_audios[:,1,:]
|
633 |
+
input_audio_0 = self.preprocess_audio(input_audio_0)
|
634 |
+
input_audio_1 = self.preprocess_audio(input_audio_1)
|
635 |
+
|
636 |
+
self.bestrq.eval()
|
637 |
+
|
638 |
+
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
639 |
+
# bestrq_middle = bestrq_middle.detach()
|
640 |
+
# bestrq_last = bestrq_last.detach()
|
641 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
|
642 |
+
bestrq_emb = bestrq_emb.detach()
|
643 |
+
|
644 |
+
# self.rvq_bestrq_middle.eval()
|
645 |
+
# quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
|
646 |
+
# self.rvq_bestrq_last.eval()
|
647 |
+
# quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
|
648 |
+
|
649 |
+
self.rvq_bestrq_emb.eval()
|
650 |
+
bestrq_emb = torch.nn.functional.avg_pool1d(bestrq_emb, kernel_size=ds, stride=ds)
|
651 |
+
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
|
652 |
+
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
653 |
+
codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
|
654 |
+
# print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
|
655 |
+
# exit()
|
656 |
+
|
657 |
+
|
658 |
+
if('spk' in additional_feats):
|
659 |
+
self.xvecmodel.eval()
|
660 |
+
spk_embeds = self.extract_spk_embeds(input_audios)
|
661 |
+
else:
|
662 |
+
spk_embeds = None
|
663 |
+
|
664 |
+
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
665 |
+
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
666 |
+
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
667 |
+
return [codes_bestrq_emb], [bestrq_emb], spk_embeds
|
668 |
+
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
669 |
+
|
670 |
+
@torch.no_grad()
|
671 |
+
def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
|
672 |
+
guidance_scale=2, num_steps=20,
|
673 |
+
disable_progress=True, scenario='start_seg'):
|
674 |
+
classifier_free_guidance = guidance_scale > 1.0
|
675 |
+
device = self.device
|
676 |
+
dtype = self.dtype
|
677 |
+
# codes_bestrq_middle, codes_bestrq_last = codes
|
678 |
+
codes_bestrq_emb = codes[0]
|
679 |
+
|
680 |
+
|
681 |
+
batch_size = codes_bestrq_emb.shape[0]
|
682 |
+
|
683 |
+
|
684 |
+
quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
|
685 |
+
# quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
|
686 |
+
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
687 |
+
print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
|
688 |
+
# quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
|
689 |
+
|
690 |
+
|
691 |
+
|
692 |
+
|
693 |
+
if('spk' in additional_feats):
|
694 |
+
spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
|
695 |
+
|
696 |
+
num_frames = quantized_bestrq_emb.shape[1]
|
697 |
+
|
698 |
+
num_channels_latents = self.num_channels
|
699 |
+
shape = (batch_size, num_frames, 64)
|
700 |
+
latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
|
701 |
+
|
702 |
+
|
703 |
+
|
704 |
+
latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
|
705 |
+
latent_masks[:,0:latent_length] = 2
|
706 |
+
if(scenario=='other_seg'):
|
707 |
+
latent_masks[:,0:incontext_length] = 1
|
708 |
+
|
709 |
+
|
710 |
+
|
711 |
+
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
712 |
+
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
713 |
+
true_latents = true_latents.permute(0,2,1).contiguous()
|
714 |
+
true_latents = self.normfeat.project_sample(true_latents)
|
715 |
+
true_latents = true_latents.permute(0,2,1).contiguous()
|
716 |
+
incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
717 |
+
incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
|
718 |
+
|
719 |
+
|
720 |
+
attention_mask=(latent_masks > 0.5)
|
721 |
+
B, L = attention_mask.size()
|
722 |
+
attention_mask = attention_mask.view(B, 1, L)
|
723 |
+
attention_mask = attention_mask * attention_mask.transpose(-1, -2)
|
724 |
+
attention_mask = attention_mask.unsqueeze(1)
|
725 |
+
latent_mask_input = self.mask_emb(latent_masks)
|
726 |
+
|
727 |
+
if('spk' in additional_feats):
|
728 |
+
# additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
|
729 |
+
additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds],1)
|
730 |
+
else:
|
731 |
+
# additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
|
732 |
+
additional_model_input = torch.cat([quantized_bestrq_emb],1)
|
733 |
+
|
734 |
+
temperature = 1.0
|
735 |
+
t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
|
736 |
+
latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask, guidance_scale)
|
737 |
+
|
738 |
+
latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
|
739 |
+
latents = latents.permute(0,2,1).contiguous()
|
740 |
+
latents = self.normfeat.return_sample(latents)
|
741 |
+
# latents = latents.permute(0,2,1).contiguous()
|
742 |
+
return latents
|
743 |
+
|
744 |
+
@torch.no_grad()
|
745 |
+
def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
|
746 |
+
disable_progress=True,layer=5,scenario='start_seg',rvq_num=1):
|
747 |
+
codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer,rvq_num)
|
748 |
+
|
749 |
+
latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
|
750 |
+
guidance_scale=guidance_scale, num_steps=num_steps, \
|
751 |
+
disable_progress=disable_progress,scenario=scenario)
|
752 |
+
return latents
|
753 |
+
|
754 |
+
@torch.no_grad()
|
755 |
+
def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
|
756 |
+
disable_progress=True,layer=5,scenario='start_seg'):
|
757 |
+
codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
|
758 |
+
import time
|
759 |
+
start = time.time()
|
760 |
+
latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
|
761 |
+
guidance_scale=guidance_scale, num_steps=num_steps, \
|
762 |
+
disable_progress=disable_progress,scenario=scenario)
|
763 |
+
return latents,time.time()-start
|
764 |
+
|
765 |
+
def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
|
766 |
+
divisor = 4
|
767 |
+
shape = (batch_size, num_channels_latents, num_frames, 32)
|
768 |
+
if(num_frames%divisor>0):
|
769 |
+
num_frames = round(num_frames/float(divisor))*divisor
|
770 |
+
shape = (batch_size, num_channels_latents, num_frames, 32)
|
771 |
+
latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
|
772 |
+
return latents
|
773 |
+
|
774 |
+
|
codeclm/tokenizer/Flow1dVAE/model_septoken.py CHANGED
@@ -1,670 +1,670 @@
|
|
1 |
-
import yaml
|
2 |
-
import random
|
3 |
-
import inspect
|
4 |
-
import numpy as np
|
5 |
-
from tqdm import tqdm
|
6 |
-
import typing as tp
|
7 |
-
from abc import ABC
|
8 |
-
|
9 |
-
import torch
|
10 |
-
import torch.nn as nn
|
11 |
-
import torch.nn.functional as F
|
12 |
-
import torchaudio
|
13 |
-
|
14 |
-
from einops import repeat
|
15 |
-
from tools.torch_tools import wav_to_fbank
|
16 |
-
|
17 |
-
from diffusers.utils.torch_utils import randn_tensor
|
18 |
-
from transformers import HubertModel
|
19 |
-
from libs.rvq.descript_quantize3 import ResidualVectorQuantize
|
20 |
-
|
21 |
-
from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
|
22 |
-
from models_gpt.models.gpt2_config import GPT2Config
|
23 |
-
|
24 |
-
from torch.cuda.amp import autocast
|
25 |
-
from our_MERT_BESTRQ.test import load_model
|
26 |
-
|
27 |
-
class HubertModelWithFinalProj(HubertModel):
|
28 |
-
def __init__(self, config):
|
29 |
-
super().__init__(config)
|
30 |
-
|
31 |
-
# The final projection layer is only used for backward compatibility.
|
32 |
-
# Following https://github.com/auspicious3000/contentvec/issues/6
|
33 |
-
# Removing this layer is necessary to achieve the desired outcome.
|
34 |
-
print("hidden_size:",config.hidden_size)
|
35 |
-
print("classifier_proj_size:",config.classifier_proj_size)
|
36 |
-
self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
|
37 |
-
|
38 |
-
|
39 |
-
- class SampleProcessor(torch.nn.Module):
-     def project_sample(self, x: torch.Tensor):
-         """Project the original sample to the 'space' where the diffusion will happen."""
-         return x
-
-     def return_sample(self, z: torch.Tensor):
-         """Project back from diffusion space to the actual sample space."""
-         return z
|
44 |
-
|
45 |
-
class Feature1DProcessor(SampleProcessor):
|
46 |
-
def __init__(self, dim: int = 100, power_std = 1., \
|
47 |
-
num_samples: int = 100_000, cal_num_frames: int = 600):
|
48 |
-
super().__init__()
|
49 |
-
|
50 |
-
self.num_samples = num_samples
|
51 |
-
self.dim = dim
|
52 |
-
self.power_std = power_std
|
53 |
-
self.cal_num_frames = cal_num_frames
|
54 |
-
self.register_buffer('counts', torch.zeros(1))
|
55 |
-
self.register_buffer('sum_x', torch.zeros(dim))
|
56 |
-
self.register_buffer('sum_x2', torch.zeros(dim))
|
57 |
-
self.register_buffer('sum_target_x2', torch.zeros(dim))
|
58 |
-
self.counts: torch.Tensor
|
59 |
-
self.sum_x: torch.Tensor
|
60 |
-
self.sum_x2: torch.Tensor
|
61 |
-
|
62 |
-
@property
|
63 |
-
def mean(self):
|
64 |
-
mean = self.sum_x / self.counts
|
65 |
-
if(self.counts < 10):
|
66 |
-
mean = torch.zeros_like(mean)
|
67 |
-
return mean
|
68 |
-
|
69 |
-
@property
|
70 |
-
def std(self):
|
71 |
-
std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
|
72 |
-
if(self.counts < 10):
|
73 |
-
std = torch.ones_like(std)
|
74 |
-
return std
|
75 |
-
|
76 |
-
@property
|
77 |
-
def target_std(self):
|
78 |
-
return 1
|
79 |
-
|
80 |
-
def project_sample(self, x: torch.Tensor):
|
81 |
-
assert x.dim() == 3
|
82 |
-
if self.counts.item() < self.num_samples:
|
83 |
-
self.counts += len(x)
|
84 |
-
self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
|
85 |
-
self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
|
86 |
-
rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
|
87 |
-
x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
|
88 |
-
return x
|
89 |
-
|
90 |
-
def return_sample(self, x: torch.Tensor):
|
91 |
-
assert x.dim() == 3
|
92 |
-
rescale = (self.std / self.target_std) ** self.power_std
|
93 |
-
x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
|
94 |
-
return x
|
95 |
-
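`Feature1DProcessor` above standardizes the 64-dim latent features with running statistics accumulated over training batches, and can invert the transform before decoding. A compact sketch of the same idea (the class and variable names here are illustrative, not the repository's):

```python
import torch

class RunningNorm:
    def __init__(self, dim=64):
        self.count, self.sum_x, self.sum_x2 = 0, torch.zeros(dim), torch.zeros(dim)

    def update(self, x):                       # x: (batch, dim, frames)
        self.count += x.shape[0]
        self.sum_x += x.mean(dim=2).sum(dim=0)
        self.sum_x2 += x.pow(2).mean(dim=2).sum(dim=0)

    def stats(self):
        mean = self.sum_x / max(self.count, 1)
        std = (self.sum_x2 / max(self.count, 1) - mean ** 2).clamp(min=0).sqrt()
        return mean, std.clamp(min=1e-12)

    def project(self, x):                      # normalize before diffusion
        mean, std = self.stats()
        return (x - mean.view(1, -1, 1)) / std.view(1, -1, 1)

    def restore(self, x):                      # undo the normalization afterwards
        mean, std = self.stats()
        return x * std.view(1, -1, 1) + mean.view(1, -1, 1)

norm = RunningNorm()
x = torch.randn(4, 64, 600) * 3 + 1
norm.update(x)
assert torch.allclose(norm.restore(norm.project(x)), x, atol=1e-4)
```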
|
96 |
-
def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
|
97 |
-
if(prior_text_encoder_hidden_states.shape[1]<len_size):
|
98 |
-
prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
|
99 |
-
torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
|
100 |
-
prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
|
101 |
-
dtype=prior_text_encoder_hidden_states.dtype)],1)
|
102 |
-
prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
|
103 |
-
else:
|
104 |
-
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
|
105 |
-
prior_text_mask = prior_text_mask[:,0:len_size]
|
106 |
-
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
|
107 |
-
return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
|
108 |
-
|
109 |
-
class BASECFM(torch.nn.Module, ABC):
|
110 |
-
def __init__(
|
111 |
-
self,
|
112 |
-
estimator,
|
113 |
-
mlp
|
114 |
-
):
|
115 |
-
super().__init__()
|
116 |
-
self.sigma_min = 1e-4
|
117 |
-
|
118 |
-
self.estimator = estimator
|
119 |
-
self.mlp = mlp
|
120 |
-
|
121 |
-
@torch.inference_mode()
|
122 |
-
def forward(self, mu, n_timesteps, temperature=1.0):
|
123 |
-
"""Forward diffusion
|
124 |
-
|
125 |
-
Args:
|
126 |
-
mu (torch.Tensor): output of encoder
|
127 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
128 |
-
n_timesteps (int): number of diffusion steps
|
129 |
-
temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
|
130 |
-
|
131 |
-
Returns:
|
132 |
-
sample: generated mel-spectrogram
|
133 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
134 |
-
"""
|
135 |
-
z = torch.randn_like(mu) * temperature
|
136 |
-
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
|
137 |
-
return self.solve_euler(z, t_span=t_span)
|
138 |
-
|
139 |
-
def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
|
140 |
-
"""
|
141 |
-
Fixed-step Euler solver for ODEs.
|
142 |
-
Args:
|
143 |
-
x (torch.Tensor): random noise
|
144 |
-
t_span (torch.Tensor): n_timesteps interpolated
|
145 |
-
shape: (n_timesteps + 1,)
|
146 |
-
mu (torch.Tensor): output of encoder
|
147 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
148 |
-
"""
|
149 |
-
t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
|
150 |
-
noise = x.clone()
|
151 |
-
|
152 |
-
# I am storing this because I can later plot it by putting a debugger here and saving it to a file
|
153 |
-
# Or in future might add like a return_all_steps flag
|
154 |
-
sol = []
|
155 |
-
|
156 |
-
for step in tqdm(range(1, len(t_span))):
|
157 |
-
x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
|
158 |
-
if(guidance_scale > 1.0):
|
159 |
-
|
160 |
-
model_input = torch.cat([ \
|
161 |
-
torch.cat([latent_mask_input, latent_mask_input], 0), \
|
162 |
-
torch.cat([incontext_x, incontext_x], 0), \
|
163 |
-
torch.cat([torch.zeros_like(mu), mu], 0), \
|
164 |
-
torch.cat([x, x], 0), \
|
165 |
-
], 2)
|
166 |
-
timestep=t.unsqueeze(-1).repeat(2)
|
167 |
-
|
168 |
-
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
169 |
-
dphi_dt_uncond, dphi_dt_cond = dphi_dt.chunk(2, 0)
|
170 |
-
dphi_dt = dphi_dt_uncond + guidance_scale * (dphi_dt_cond - dphi_dt_uncond)
|
171 |
-
else:
|
172 |
-
model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
|
173 |
-
timestep=t.unsqueeze(-1)
|
174 |
-
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
175 |
-
|
176 |
-
dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
|
177 |
-
x = x + dt * dphi_dt
|
178 |
-
t = t + dt
|
179 |
-
sol.append(x)
|
180 |
-
if step < len(t_span) - 1:
|
181 |
-
dt = t_span[step + 1] - t
|
182 |
-
|
183 |
-
return sol[-1]
|
184 |
-
|
185 |
-
def projection_loss(self,hidden_proj, bestrq_emb):
|
186 |
-
bsz = hidden_proj.shape[0]
|
187 |
-
|
188 |
-
hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
|
189 |
-
bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
|
190 |
-
|
191 |
-
proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
|
192 |
-
proj_loss = 1+proj_loss.mean()
|
193 |
-
|
194 |
-
return proj_loss
|
195 |
-
|
196 |
-
def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
|
197 |
-
"""Computes diffusion loss
|
198 |
-
|
199 |
-
Args:
|
200 |
-
x1 (torch.Tensor): Target
|
201 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
202 |
-
mu (torch.Tensor): output of encoder
|
203 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
204 |
-
|
205 |
-
Returns:
|
206 |
-
loss: conditional flow matching loss
|
207 |
-
y: conditional flow
|
208 |
-
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
209 |
-
"""
|
210 |
-
b = mu[0].shape[0]
|
211 |
-
len_x = x1.shape[2]
|
212 |
-
# random timestep
|
213 |
-
if(validation_mode):
|
214 |
-
t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
|
215 |
-
else:
|
216 |
-
t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
|
217 |
-
# sample noise p(x_0)
|
218 |
-
z = torch.randn_like(x1)
|
219 |
-
|
220 |
-
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
221 |
-
u = x1 - (1 - self.sigma_min) * z
|
222 |
-
model_input = torch.cat([*mu,y], 2)
|
223 |
-
t=t.squeeze(-1).squeeze(-1)
|
224 |
-
out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
|
225 |
-
hidden_layer_7 = out.hidden_states[7]
|
226 |
-
hidden_proj = self.mlp(hidden_layer_7)
|
227 |
-
out = out.last_hidden_state
|
228 |
-
out=out[:,:,-len_x:]
|
229 |
-
|
230 |
-
weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
|
231 |
-
loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
|
232 |
-
loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
|
233 |
-
loss = loss_re + loss_cos * 0.5
|
234 |
-
return loss, loss_re, loss_cos
|
235 |
-
|
236 |
-
class PromptCondAudioDiffusion(nn.Module):
|
237 |
-
def __init__(
|
238 |
-
self,
|
239 |
-
num_channels,
|
240 |
-
unet_model_name=None,
|
241 |
-
unet_model_config_path=None,
|
242 |
-
snr_gamma=None,
|
243 |
-
uncondition=True,
|
244 |
-
out_paint=False,
|
245 |
-
):
|
246 |
-
super().__init__()
|
247 |
-
|
248 |
-
assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
|
249 |
-
|
250 |
-
self.unet_model_name = unet_model_name
|
251 |
-
self.unet_model_config_path = unet_model_config_path
|
252 |
-
self.snr_gamma = snr_gamma
|
253 |
-
self.uncondition = uncondition
|
254 |
-
self.num_channels = num_channels
|
255 |
-
|
256 |
-
# https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
|
257 |
-
self.normfeat = Feature1DProcessor(dim=64)
|
258 |
-
|
259 |
-
self.sample_rate = 48000
|
260 |
-
self.num_samples_perseg = self.sample_rate * 20 // 1000
|
261 |
-
self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
|
262 |
-
self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
|
263 |
-
# self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
264 |
-
# self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
265 |
-
self.bestrq = load_model(
|
266 |
-
model_dir='codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq',
|
267 |
-
checkpoint_dir='ckpt/encode-s12k.pt',
|
268 |
-
)
|
269 |
-
self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
|
270 |
-
self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
|
271 |
-
for v in self.bestrq.parameters():v.requires_grad = False
|
272 |
-
self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
273 |
-
self.rvq_bestrq_bgm_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
274 |
-
self.hubert = HubertModelWithFinalProj.from_pretrained("ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
|
275 |
-
for v in self.hubert.parameters():v.requires_grad = False
|
276 |
-
self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
|
277 |
-
# self.xvecmodel = XVECModel()
|
278 |
-
config = GPT2Config(n_positions=1000,n_layer=16,n_head=20,n_embd=2200,n_inner=4400)
|
279 |
-
unet = GPT2Model(config)
|
280 |
-
mlp = nn.Sequential(
|
281 |
-
nn.Linear(2200, 1024),
|
282 |
-
nn.SiLU(),
|
283 |
-
nn.Linear(1024, 1024),
|
284 |
-
nn.SiLU(),
|
285 |
-
nn.Linear(1024, 768)
|
286 |
-
)
|
287 |
-
self.set_from = "random"
|
288 |
-
self.cfm_wrapper = BASECFM(unet, mlp)
|
289 |
-
self.mask_emb = torch.nn.Embedding(3, 24)
|
290 |
-
print("Transformer initialized from pretrain.")
|
291 |
-
torch.cuda.empty_cache()
|
292 |
-
|
293 |
-
def compute_snr(self, timesteps):
|
294 |
-
"""
|
295 |
-
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
|
296 |
-
"""
|
297 |
-
alphas_cumprod = self.noise_scheduler.alphas_cumprod
|
298 |
-
sqrt_alphas_cumprod = alphas_cumprod**0.5
|
299 |
-
sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
|
300 |
-
|
301 |
-
# Expand the tensors.
|
302 |
-
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
|
303 |
-
sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
304 |
-
while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
|
305 |
-
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
|
306 |
-
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
|
307 |
-
|
308 |
-
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
309 |
-
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
|
310 |
-
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
|
311 |
-
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
|
312 |
-
|
313 |
-
# Compute SNR.
|
314 |
-
snr = (alpha / sigma) ** 2
|
315 |
-
return snr
|
316 |
-
|
317 |
-
def preprocess_audio(self, input_audios, threshold=0.9):
|
318 |
-
assert len(input_audios.shape) == 2, input_audios.shape
|
319 |
-
norm_value = torch.ones_like(input_audios[:,0])
|
320 |
-
max_volume = input_audios.abs().max(dim=-1)[0]
|
321 |
-
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
322 |
-
return input_audios/norm_value.unsqueeze(-1)
|
323 |
-
|
324 |
-
def extract_wav2vec_embeds(self, input_audios,output_len):
|
325 |
-
wav2vec_stride = 2
|
326 |
-
|
327 |
-
wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
|
328 |
-
wav2vec_embeds_last=wav2vec_embeds[-1]
|
329 |
-
wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
|
330 |
-
return wav2vec_embeds_last
|
331 |
-
|
332 |
-
def extract_mert_embeds(self, input_audios):
|
333 |
-
prompt_stride = 3
|
334 |
-
inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
|
335 |
-
input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
|
336 |
-
prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
|
337 |
-
mert_emb= prompt_embeds[-1]
|
338 |
-
mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=375, mode='linear', align_corners=False).permute(0, 2, 1)
|
339 |
-
|
340 |
-
return mert_emb
|
341 |
-
|
342 |
-
def extract_bestrq_embeds(self, input_audio_vocal_0,input_audio_vocal_1,layer):
|
343 |
-
input_wav_mean = (input_audio_vocal_0 + input_audio_vocal_1) / 2.0
|
344 |
-
input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
|
345 |
-
layer_results = input_wav_mean['layer_results']
|
346 |
-
bestrq_emb = layer_results[layer]
|
347 |
-
bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
|
348 |
-
return bestrq_emb
|
349 |
-
|
350 |
-
|
351 |
-
def extract_spk_embeds(self, input_audios):
|
352 |
-
spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
|
353 |
-
spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
|
354 |
-
return spk_embeds
|
355 |
-
|
356 |
-
def extract_lyric_feats(self, lyric):
|
357 |
-
with torch.no_grad():
|
358 |
-
try:
|
359 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
|
360 |
-
except:
|
361 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
|
362 |
-
text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
|
363 |
-
text_mask = text_mask.to(self.device)
|
364 |
-
text_encoder_hidden_states, text_mask, text_prompt_embeds = \
|
365 |
-
pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
|
366 |
-
text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
|
367 |
-
return text_encoder_hidden_states, text_mask
|
368 |
-
|
369 |
-
def extract_energy_bar(self, input_audios):
|
370 |
-
if(input_audios.shape[-1] % self.num_samples_perseg > 0):
|
371 |
-
energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
372 |
-
else:
|
373 |
-
energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
374 |
-
energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
|
375 |
-
energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
|
376 |
-
energy_embedding = self.energy_embedding(energy_bar)
|
377 |
-
energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
|
378 |
-
return energy_embedding
|
379 |
-
|
380 |
-
def forward(self, input_audios_vocal,input_audios_bgm, lyric, latents, latent_masks, validation_mode=False, \
|
381 |
-
additional_feats = ['spk', 'lyric'], \
|
382 |
-
train_rvq=True, train_ssl=False,layer_vocal=7,layer_bgm=7):
|
383 |
-
if not hasattr(self,"device"):
|
384 |
-
self.device = input_audios_vocal.device
|
385 |
-
if not hasattr(self,"dtype"):
|
386 |
-
self.dtype = input_audios_vocal.dtype
|
387 |
-
device = self.device
|
388 |
-
input_audio_vocal_0 = input_audios_vocal[:,0,:]
|
389 |
-
input_audio_vocal_1 = input_audios_vocal[:,1,:]
|
390 |
-
input_audio_vocal_0 = self.preprocess_audio(input_audio_vocal_0)
|
391 |
-
input_audio_vocal_1 = self.preprocess_audio(input_audio_vocal_1)
|
392 |
-
input_audios_vocal_wav2vec = (input_audio_vocal_0 + input_audio_vocal_1) / 2.0
|
393 |
-
|
394 |
-
input_audio_bgm_0 = input_audios_bgm[:,0,:]
|
395 |
-
input_audio_bgm_1 = input_audios_bgm[:,1,:]
|
396 |
-
input_audio_bgm_0 = self.preprocess_audio(input_audio_bgm_0)
|
397 |
-
input_audio_bgm_1 = self.preprocess_audio(input_audio_bgm_1)
|
398 |
-
input_audios_bgm_wav2vec = (input_audio_bgm_0 + input_audio_bgm_1) / 2.0
|
399 |
-
|
400 |
-
if(train_ssl):
|
401 |
-
self.wav2vec.train()
|
402 |
-
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
|
403 |
-
self.clap_embd_extractor.train()
|
404 |
-
prompt_embeds = self.extract_mert_embeds(input_audios)
|
405 |
-
if('spk' in additional_feats):
|
406 |
-
self.xvecmodel.train()
|
407 |
-
spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
|
408 |
-
else:
|
409 |
-
with torch.no_grad():
|
410 |
-
with autocast(enabled=False):
|
411 |
-
bestrq_emb = self.extract_bestrq_embeds(input_audio_vocal_0,input_audio_vocal_1,layer_vocal)
|
412 |
-
bestrq_emb_bgm = self.extract_bestrq_embeds(input_audio_bgm_0,input_audio_bgm_1,layer_bgm)
|
413 |
-
# mert_emb = self.extract_mert_embeds(input_audios_mert)
|
414 |
-
output_len = bestrq_emb.shape[2]
|
415 |
-
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_vocal_wav2vec+input_audios_bgm_wav2vec,output_len)
|
416 |
-
|
417 |
-
|
418 |
-
bestrq_emb = bestrq_emb.detach()
|
419 |
-
bestrq_emb_bgm = bestrq_emb_bgm.detach()
|
420 |
-
|
421 |
-
if('lyric' in additional_feats):
|
422 |
-
text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
|
423 |
-
else:
|
424 |
-
text_encoder_hidden_states, text_mask = None, None
|
425 |
-
|
426 |
-
|
427 |
-
if(train_rvq):
|
428 |
-
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
429 |
-
quantized_bestrq_emb_bgm, _, _, commitment_loss_bestrq_emb_bgm, codebook_loss_bestrq_emb_bgm,_ = self.rvq_bestrq_bgm_emb(bestrq_emb_bgm) # b,d,t
|
430 |
-
else:
|
431 |
-
bestrq_emb = bestrq_emb.float()
|
432 |
-
self.rvq_bestrq_emb.eval()
|
433 |
-
# with autocast(enabled=False):
|
434 |
-
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
435 |
-
commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
|
436 |
-
codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
|
437 |
-
quantized_bestrq_emb = quantized_bestrq_emb.detach()
|
438 |
-
|
439 |
-
commitment_loss = commitment_loss_bestrq_emb+commitment_loss_bestrq_emb_bgm
|
440 |
-
codebook_loss = codebook_loss_bestrq_emb+codebook_loss_bestrq_emb_bgm
|
441 |
-
|
442 |
-
|
443 |
-
alpha=1
|
444 |
-
quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
|
445 |
-
quantized_bestrq_emb_bgm = quantized_bestrq_emb_bgm * alpha + bestrq_emb_bgm * (1-alpha)
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
scenario = np.random.choice(['start_seg', 'other_seg'])
|
451 |
-
if(scenario == 'other_seg'):
|
452 |
-
for binx in range(input_audios_vocal.shape[0]):
|
453 |
-
# latent_masks[binx,0:64] = 1
|
454 |
-
latent_masks[binx,0:random.randint(64,128)] = 1
|
455 |
-
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
456 |
-
quantized_bestrq_emb_bgm = quantized_bestrq_emb_bgm.permute(0,2,1).contiguous()
|
457 |
-
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
458 |
-
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
459 |
-
quantized_bestrq_emb_bgm = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb_bgm \
|
460 |
-
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
if self.uncondition:
|
466 |
-
mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
|
467 |
-
if len(mask_indices) > 0:
|
468 |
-
quantized_bestrq_emb[mask_indices] = 0
|
469 |
-
quantized_bestrq_emb_bgm[mask_indices] = 0
|
470 |
-
latents = latents.permute(0,2,1).contiguous()
|
471 |
-
latents = self.normfeat.project_sample(latents)
|
472 |
-
latents = latents.permute(0,2,1).contiguous()
|
473 |
-
incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
474 |
-
attention_mask=(latent_masks > 0.5)
|
475 |
-
B, L = attention_mask.size()
|
476 |
-
attention_mask = attention_mask.view(B, 1, L)
|
477 |
-
1 |
+
import yaml
|
2 |
+
import random
|
3 |
+
import inspect
|
4 |
+
import numpy as np
|
5 |
+
from tqdm import tqdm
|
6 |
+
import typing as tp
|
7 |
+
from abc import ABC
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import torchaudio
|
13 |
+
|
14 |
+
from einops import repeat
|
15 |
+
from tools.torch_tools import wav_to_fbank
|
16 |
+
|
17 |
+
from diffusers.utils.torch_utils import randn_tensor
|
18 |
+
from transformers import HubertModel
|
19 |
+
from libs.rvq.descript_quantize3 import ResidualVectorQuantize
|
20 |
+
|
21 |
+
from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
|
22 |
+
from models_gpt.models.gpt2_config import GPT2Config
|
23 |
+
|
24 |
+
from torch.cuda.amp import autocast
|
25 |
+
from our_MERT_BESTRQ.test import load_model
|
26 |
+
|
27 |
+
class HubertModelWithFinalProj(HubertModel):
|
28 |
+
def __init__(self, config):
|
29 |
+
super().__init__(config)
|
30 |
+
|
31 |
+
# The final projection layer is only used for backward compatibility.
|
32 |
+
# Following https://github.com/auspicious3000/contentvec/issues/6
|
33 |
+
# Remove this layer is necessary to achieve the desired outcome.
|
34 |
+
print("hidden_size:",config.hidden_size)
|
35 |
+
print("classifier_proj_size:",config.classifier_proj_size)
|
36 |
+
self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
|
37 |
+
|
38 |
+
|
39 |
+
class SampleProcessor(torch.nn.Module):
|
40 |
+
    def project_sample(self, x: torch.Tensor):
        """Project the original sample to the 'space' where the diffusion will happen."""
        return x

    def return_sample(self, z: torch.Tensor):
        """Project back from diffusion space to the actual sample space."""
        return z
|
44 |
+
|
45 |
+
class Feature1DProcessor(SampleProcessor):
|
46 |
+
def __init__(self, dim: int = 100, power_std = 1., \
|
47 |
+
num_samples: int = 100_000, cal_num_frames: int = 600):
|
48 |
+
super().__init__()
|
49 |
+
|
50 |
+
self.num_samples = num_samples
|
51 |
+
self.dim = dim
|
52 |
+
self.power_std = power_std
|
53 |
+
self.cal_num_frames = cal_num_frames
|
54 |
+
self.register_buffer('counts', torch.zeros(1))
|
55 |
+
self.register_buffer('sum_x', torch.zeros(dim))
|
56 |
+
self.register_buffer('sum_x2', torch.zeros(dim))
|
57 |
+
self.register_buffer('sum_target_x2', torch.zeros(dim))
|
58 |
+
self.counts: torch.Tensor
|
59 |
+
self.sum_x: torch.Tensor
|
60 |
+
self.sum_x2: torch.Tensor
|
61 |
+
|
62 |
+
@property
|
63 |
+
def mean(self):
|
64 |
+
mean = self.sum_x / self.counts
|
65 |
+
if(self.counts < 10):
|
66 |
+
mean = torch.zeros_like(mean)
|
67 |
+
return mean
|
68 |
+
|
69 |
+
@property
|
70 |
+
def std(self):
|
71 |
+
std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
|
72 |
+
if(self.counts < 10):
|
73 |
+
std = torch.ones_like(std)
|
74 |
+
return std
|
75 |
+
|
76 |
+
@property
|
77 |
+
def target_std(self):
|
78 |
+
return 1
|
79 |
+
|
80 |
+
def project_sample(self, x: torch.Tensor):
|
81 |
+
assert x.dim() == 3
|
82 |
+
if self.counts.item() < self.num_samples:
|
83 |
+
self.counts += len(x)
|
84 |
+
self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
|
85 |
+
self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
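# Per-channel mean/std statistics are accumulated online from the first num_samples batches (using only the first cal_num_frames frames) and then frozen; they whiten the latents here and are undone in return_sample.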
|
86 |
+
rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
|
87 |
+
x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
|
88 |
+
return x
|
89 |
+
|
90 |
+
def return_sample(self, x: torch.Tensor):
|
91 |
+
assert x.dim() == 3
|
92 |
+
rescale = (self.std / self.target_std) ** self.power_std
|
93 |
+
x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
|
94 |
+
return x
|
95 |
+
|
96 |
+
def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
|
97 |
+
if(prior_text_encoder_hidden_states.shape[1]<len_size):
|
98 |
+
prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
|
99 |
+
torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
|
100 |
+
prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
|
101 |
+
dtype=prior_text_encoder_hidden_states.dtype)],1)
|
102 |
+
prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
|
103 |
+
else:
|
104 |
+
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
|
105 |
+
prior_text_mask = prior_text_mask[:,0:len_size]
|
106 |
+
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
|
107 |
+
return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
|
108 |
+
|
109 |
+
class BASECFM(torch.nn.Module, ABC):
|
110 |
+
def __init__(
|
111 |
+
self,
|
112 |
+
estimator,
|
113 |
+
mlp
|
114 |
+
):
|
115 |
+
super().__init__()
|
116 |
+
self.sigma_min = 1e-4
|
117 |
+
|
118 |
+
self.estimator = estimator
|
119 |
+
self.mlp = mlp
|
120 |
+
|
121 |
+
@torch.inference_mode()
|
122 |
+
def forward(self, mu, n_timesteps, temperature=1.0):
|
123 |
+
"""Forward diffusion
|
124 |
+
|
125 |
+
Args:
|
126 |
+
mu (torch.Tensor): output of encoder
|
127 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
128 |
+
n_timesteps (int): number of diffusion steps
|
129 |
+
temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
|
130 |
+
|
131 |
+
Returns:
|
132 |
+
sample: generated mel-spectrogram
|
133 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
134 |
+
"""
|
135 |
+
z = torch.randn_like(mu) * temperature
|
136 |
+
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
|
137 |
+
return self.solve_euler(z, t_span=t_span)
|
138 |
+
|
139 |
+
def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
|
140 |
+
"""
|
141 |
+
Fixed-step Euler solver for ODEs.
|
142 |
+
Args:
|
143 |
+
x (torch.Tensor): random noise
|
144 |
+
t_span (torch.Tensor): n_timesteps interpolated
|
145 |
+
shape: (n_timesteps + 1,)
|
146 |
+
mu (torch.Tensor): output of encoder
|
147 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
148 |
+
"""
|
149 |
+
t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
|
150 |
+
noise = x.clone()
|
151 |
+
|
152 |
+
# I am storing this because I can later plot it by putting a debugger here and saving it to a file
|
153 |
+
# Or in future might add like a return_all_steps flag
|
154 |
+
sol = []
|
155 |
+
|
156 |
+
for step in tqdm(range(1, len(t_span))):
|
157 |
+
x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
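# The in-context (prompt) frames are reset at every step to their exact flow state at time t, so only the remaining frames are freely generated.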
|
158 |
+
if(guidance_scale > 1.0):
|
159 |
+
|
160 |
+
model_input = torch.cat([ \
|
161 |
+
torch.cat([latent_mask_input, latent_mask_input], 0), \
|
162 |
+
torch.cat([incontext_x, incontext_x], 0), \
|
163 |
+
torch.cat([torch.zeros_like(mu), mu], 0), \
|
164 |
+
torch.cat([x, x], 0), \
|
165 |
+
], 2)
|
166 |
+
timestep=t.unsqueeze(-1).repeat(2)
|
167 |
+
|
168 |
+
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
169 |
+
dphi_dt_uncond, dphi_dt_cond = dphi_dt.chunk(2,0)
|
170 |
+
dphi_dt = dphi_dt_uncond + guidance_scale * (dphi_dt_cond - dphi_dt_uncond)
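# Classifier-free guidance: the batch is run twice (conditioning zeroed vs. kept) and the unconditional and conditional velocity estimates are blended with guidance_scale.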
|
171 |
+
else:
|
172 |
+
model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
|
173 |
+
timestep=t.unsqueeze(-1)
|
174 |
+
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
175 |
+
|
176 |
+
dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
|
177 |
+
x = x + dt * dphi_dt
|
178 |
+
t = t + dt
|
179 |
+
sol.append(x)
|
180 |
+
if step < len(t_span) - 1:
|
181 |
+
dt = t_span[step + 1] - t
|
182 |
+
|
183 |
+
return sol[-1]
|
184 |
+
|
185 |
+
def projection_loss(self,hidden_proj, bestrq_emb):
|
186 |
+
bsz = hidden_proj.shape[0]
|
187 |
+
|
188 |
+
hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
|
189 |
+
bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
|
190 |
+
|
191 |
+
proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
|
192 |
+
proj_loss = 1+proj_loss.mean()
|
193 |
+
|
194 |
+
return proj_loss
|
195 |
+
|
196 |
+
def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
|
197 |
+
"""Computes diffusion loss
|
198 |
+
|
199 |
+
Args:
|
200 |
+
x1 (torch.Tensor): Target
|
201 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
202 |
+
mu (torch.Tensor): output of encoder
|
203 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
204 |
+
|
205 |
+
Returns:
|
206 |
+
loss: conditional flow matching loss
|
207 |
+
y: conditional flow
|
208 |
+
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
209 |
+
"""
|
210 |
+
b = mu[0].shape[0]
|
211 |
+
len_x = x1.shape[2]
|
212 |
+
# random timestep
|
213 |
+
if(validation_mode):
|
214 |
+
t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
|
215 |
+
else:
|
216 |
+
t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
|
217 |
+
# sample noise p(x_0)
|
218 |
+
z = torch.randn_like(x1)
|
219 |
+
|
220 |
+
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
221 |
+
u = x1 - (1 - self.sigma_min) * z
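# Flow-matching target: y is the interpolated sample at time t and u is the constant velocity of the straight noise-to-data path that the transformer is trained to predict.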
|
222 |
+
model_input = torch.cat([*mu,y], 2)
|
223 |
+
t=t.squeeze(-1).squeeze(-1)
|
224 |
+
out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
|
225 |
+
hidden_layer_7 = out.hidden_states[7]
|
226 |
+
hidden_proj = self.mlp(hidden_layer_7)
|
227 |
+
out = out.last_hidden_state
|
228 |
+
out=out[:,:,-len_x:]
|
229 |
+
|
230 |
+
weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
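# Per-frame loss weights: frames to generate (mask == 2) get weight 1, padding (mask == 0) gets 0.01, and in-context frames (mask == 1) are excluded from the regression loss.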
|
231 |
+
loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
|
232 |
+
loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
|
233 |
+
loss = loss_re + loss_cos * 0.5
|
234 |
+
return loss, loss_re, loss_cos
|
235 |
+
|
236 |
+
class PromptCondAudioDiffusion(nn.Module):
|
237 |
+
def __init__(
|
238 |
+
self,
|
239 |
+
num_channels,
|
240 |
+
unet_model_name=None,
|
241 |
+
unet_model_config_path=None,
|
242 |
+
snr_gamma=None,
|
243 |
+
uncondition=True,
|
244 |
+
out_paint=False,
|
245 |
+
):
|
246 |
+
super().__init__()
|
247 |
+
|
248 |
+
assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
|
249 |
+
|
250 |
+
self.unet_model_name = unet_model_name
|
251 |
+
self.unet_model_config_path = unet_model_config_path
|
252 |
+
self.snr_gamma = snr_gamma
|
253 |
+
self.uncondition = uncondition
|
254 |
+
self.num_channels = num_channels
|
255 |
+
|
256 |
+
# https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
|
257 |
+
self.normfeat = Feature1DProcessor(dim=64)
|
258 |
+
|
259 |
+
self.sample_rate = 48000
|
260 |
+
self.num_samples_perseg = self.sample_rate * 20 // 1000
|
261 |
+
self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
|
262 |
+
self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
|
263 |
+
# self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
264 |
+
# self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
|
265 |
+
self.bestrq = load_model(
|
266 |
+
model_dir='codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq',
|
267 |
+
checkpoint_dir='ckpt/encode-s12k.pt',
|
268 |
+
)
|
269 |
+
self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
|
270 |
+
self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
|
271 |
+
for v in self.bestrq.parameters():v.requires_grad = False
|
272 |
+
self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
273 |
+
self.rvq_bestrq_bgm_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
|
274 |
+
self.hubert = HubertModelWithFinalProj.from_pretrained("ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
|
275 |
+
for v in self.hubert.parameters():v.requires_grad = False
|
276 |
+
self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
|
277 |
+
# self.xvecmodel = XVECModel()
|
278 |
+
config = GPT2Config(n_positions=1000,n_layer=16,n_head=20,n_embd=2200,n_inner=4400)
|
279 |
+
unet = GPT2Model(config)
|
280 |
+
mlp = nn.Sequential(
|
281 |
+
nn.Linear(2200, 1024),
|
282 |
+
nn.SiLU(),
|
283 |
+
nn.Linear(1024, 1024),
|
284 |
+
nn.SiLU(),
|
285 |
+
nn.Linear(1024, 768)
|
286 |
+
)
|
287 |
+
self.set_from = "random"
|
288 |
+
self.cfm_wrapper = BASECFM(unet, mlp)
|
289 |
+
self.mask_emb = torch.nn.Embedding(3, 24)
|
290 |
+
print("Transformer initialized from pretrain.")
|
291 |
+
torch.cuda.empty_cache()
|
292 |
+
|
293 |
+
def compute_snr(self, timesteps):
|
294 |
+
"""
|
295 |
+
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
|
296 |
+
"""
|
297 |
+
alphas_cumprod = self.noise_scheduler.alphas_cumprod
|
298 |
+
sqrt_alphas_cumprod = alphas_cumprod**0.5
|
299 |
+
sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
|
300 |
+
|
301 |
+
# Expand the tensors.
|
302 |
+
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
|
303 |
+
sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
304 |
+
while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
|
305 |
+
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
|
306 |
+
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
|
307 |
+
|
308 |
+
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
|
309 |
+
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
|
310 |
+
sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
|
311 |
+
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
|
312 |
+
|
313 |
+
# Compute SNR.
|
314 |
+
snr = (alpha / sigma) ** 2
|
315 |
+
return snr
|
316 |
+
|
317 |
+
def preprocess_audio(self, input_audios, threshold=0.9):
|
318 |
+
assert len(input_audios.shape) == 2, input_audios.shape
|
319 |
+
norm_value = torch.ones_like(input_audios[:,0])
|
320 |
+
max_volume = input_audios.abs().max(dim=-1)[0]
|
321 |
+
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
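# Only clips whose absolute peak exceeds threshold are scaled down so their peak becomes exactly the threshold; quieter clips keep their original level.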
|
322 |
+
return input_audios/norm_value.unsqueeze(-1)
|
323 |
+
|
324 |
+
def extract_wav2vec_embeds(self, input_audios,output_len):
|
325 |
+
wav2vec_stride = 2
|
326 |
+
|
327 |
+
wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
|
328 |
+
wav2vec_embeds_last=wav2vec_embeds[-1]
|
329 |
+
wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
|
330 |
+
return wav2vec_embeds_last
|
331 |
+
|
332 |
+
def extract_mert_embeds(self, input_audios):
|
333 |
+
prompt_stride = 3
|
334 |
+
inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
|
335 |
+
input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
|
336 |
+
prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
|
337 |
+
mert_emb= prompt_embeds[-1]
|
338 |
+
mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=375, mode='linear', align_corners=False).permute(0, 2, 1)
|
339 |
+
|
340 |
+
return mert_emb
|
341 |
+
|
342 |
+
def extract_bestrq_embeds(self, input_audio_vocal_0,input_audio_vocal_1,layer):
|
343 |
+
input_wav_mean = (input_audio_vocal_0 + input_audio_vocal_1) / 2.0
|
344 |
+
input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
|
345 |
+
layer_results = input_wav_mean['layer_results']
|
346 |
+
bestrq_emb = layer_results[layer]
|
347 |
+
bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
|
348 |
+
return bestrq_emb
|
349 |
+
|
350 |
+
|
351 |
+
def extract_spk_embeds(self, input_audios):
|
352 |
+
spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
|
353 |
+
spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
|
354 |
+
return spk_embeds
|
355 |
+
|
356 |
+
def extract_lyric_feats(self, lyric):
|
357 |
+
with torch.no_grad():
|
358 |
+
try:
|
359 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
|
360 |
+
except:
|
361 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
|
362 |
+
text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
|
363 |
+
text_mask = text_mask.to(self.device)
|
364 |
+
text_encoder_hidden_states, text_mask, text_prompt_embeds = \
|
365 |
+
pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
|
366 |
+
text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
|
367 |
+
return text_encoder_hidden_states, text_mask
|
368 |
+
|
369 |
+
def extract_energy_bar(self, input_audios):
|
370 |
+
if(input_audios.shape[-1] % self.num_samples_perseg > 0):
|
371 |
+
energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
372 |
+
else:
|
373 |
+
energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
|
374 |
+
energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
|
375 |
+
energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
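# RMS energy of each 20 ms segment is converted to dB and quantized into 17 bins (0-16) before being embedded.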
|
376 |
+
energy_embedding = self.energy_embedding(energy_bar)
|
377 |
+
energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
|
378 |
+
return energy_embedding
|
379 |
+
|
380 |
+
def forward(self, input_audios_vocal,input_audios_bgm, lyric, latents, latent_masks, validation_mode=False, \
|
381 |
+
additional_feats = ['spk', 'lyric'], \
|
382 |
+
train_rvq=True, train_ssl=False,layer_vocal=7,layer_bgm=7):
|
383 |
+
if not hasattr(self,"device"):
|
384 |
+
self.device = input_audios_vocal.device
|
385 |
+
if not hasattr(self,"dtype"):
|
386 |
+
self.dtype = input_audios_vocal.dtype
|
387 |
+
device = self.device
|
388 |
+
input_audio_vocal_0 = input_audios_vocal[:,0,:]
|
389 |
+
input_audio_vocal_1 = input_audios_vocal[:,1,:]
|
390 |
+
input_audio_vocal_0 = self.preprocess_audio(input_audio_vocal_0)
|
391 |
+
input_audio_vocal_1 = self.preprocess_audio(input_audio_vocal_1)
|
392 |
+
input_audios_vocal_wav2vec = (input_audio_vocal_0 + input_audio_vocal_1) / 2.0
|
393 |
+
|
394 |
+
input_audio_bgm_0 = input_audios_bgm[:,0,:]
|
395 |
+
input_audio_bgm_1 = input_audios_bgm[:,1,:]
|
396 |
+
input_audio_bgm_0 = self.preprocess_audio(input_audio_bgm_0)
|
397 |
+
input_audio_bgm_1 = self.preprocess_audio(input_audio_bgm_1)
|
398 |
+
input_audios_bgm_wav2vec = (input_audio_bgm_0 + input_audio_bgm_1) / 2.0
|
399 |
+
|
400 |
+
if(train_ssl):
|
401 |
+
self.wav2vec.train()
|
402 |
+
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
|
403 |
+
self.clap_embd_extractor.train()
|
404 |
+
prompt_embeds = self.extract_mert_embeds(input_audios)
|
405 |
+
if('spk' in additional_feats):
|
406 |
+
self.xvecmodel.train()
|
407 |
+
spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
|
408 |
+
else:
|
409 |
+
with torch.no_grad():
|
410 |
+
with autocast(enabled=False):
|
411 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_vocal_0,input_audio_vocal_1,layer_vocal)
|
412 |
+
bestrq_emb_bgm = self.extract_bestrq_embeds(input_audio_bgm_0,input_audio_bgm_1,layer_bgm)
|
413 |
+
# mert_emb = self.extract_mert_embeds(input_audios_mert)
|
414 |
+
output_len = bestrq_emb.shape[2]
|
415 |
+
wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_vocal_wav2vec+input_audios_bgm_wav2vec,output_len)
|
416 |
+
|
417 |
+
|
418 |
+
bestrq_emb = bestrq_emb.detach()
|
419 |
+
bestrq_emb_bgm = bestrq_emb_bgm.detach()
|
420 |
+
|
421 |
+
if('lyric' in additional_feats):
|
422 |
+
text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
|
423 |
+
else:
|
424 |
+
text_encoder_hidden_states, text_mask = None, None
|
425 |
+
|
426 |
+
|
427 |
+
if(train_rvq):
|
428 |
+
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
429 |
+
quantized_bestrq_emb_bgm, _, _, commitment_loss_bestrq_emb_bgm, codebook_loss_bestrq_emb_bgm,_ = self.rvq_bestrq_bgm_emb(bestrq_emb_bgm) # b,d,t
|
430 |
+
else:
|
431 |
+
bestrq_emb = bestrq_emb.float()
|
432 |
+
self.rvq_bestrq_emb.eval()
|
433 |
+
# with autocast(enabled=False):
|
434 |
+
quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
435 |
+
commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
|
436 |
+
codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
|
437 |
+
quantized_bestrq_emb = quantized_bestrq_emb.detach()
|
438 |
+
|
439 |
+
commitment_loss = commitment_loss_bestrq_emb+commitment_loss_bestrq_emb_bgm
|
440 |
+
codebook_loss = codebook_loss_bestrq_emb+codebook_loss_bestrq_emb_bgm
|
441 |
+
|
442 |
+
|
443 |
+
alpha=1
|
444 |
+
quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
|
445 |
+
quantized_bestrq_emb_bgm = quantized_bestrq_emb_bgm * alpha + bestrq_emb_bgm * (1-alpha)
|
446 |
+
|
447 |
+
|
448 |
+
|
449 |
+
|
450 |
+
scenario = np.random.choice(['start_seg', 'other_seg'])
|
451 |
+
if(scenario == 'other_seg'):
|
452 |
+
for binx in range(input_audios_vocal.shape[0]):
|
453 |
+
# latent_masks[binx,0:64] = 1
|
454 |
+
latent_masks[binx,0:random.randint(64,128)] = 1
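# For continuation ('other_seg') training, a random prefix of 64-128 latent frames is marked as in-context (mask value 1); 'start_seg' keeps no prefix.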
|
455 |
+
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
456 |
+
quantized_bestrq_emb_bgm = quantized_bestrq_emb_bgm.permute(0,2,1).contiguous()
|
457 |
+
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
458 |
+
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
459 |
+
quantized_bestrq_emb_bgm = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb_bgm \
|
460 |
+
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
461 |
+
|
462 |
+
|
463 |
+
|
464 |
+
|
465 |
+
if self.uncondition:
|
466 |
+
mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
|
467 |
+
if len(mask_indices) > 0:
|
468 |
+
quantized_bestrq_emb[mask_indices] = 0
|
469 |
+
quantized_bestrq_emb_bgm[mask_indices] = 0
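# Condition dropout: roughly 10% of items have both vocal and BGM conditions zeroed so the model also learns the unconditional branch needed for classifier-free guidance.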
|
470 |
+
latents = latents.permute(0,2,1).contiguous()
|
471 |
+
latents = self.normfeat.project_sample(latents)
|
472 |
+
latents = latents.permute(0,2,1).contiguous()
|
473 |
+
incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
474 |
+
attention_mask=(latent_masks > 0.5)
|
475 |
+
B, L = attention_mask.size()
|
476 |
+
attention_mask = attention_mask.view(B, 1, L)
|
477 |
+
attention_mask = attention_mask * attention_mask.transpose(-1, -2)
|
478 |
+
attention_mask = attention_mask.unsqueeze(1)
|
479 |
+
latent_mask_input = self.mask_emb(latent_masks)
|
480 |
+
loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb,quantized_bestrq_emb_bgm], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
|
481 |
+
return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
|
482 |
+
|
483 |
+
def init_device_dtype(self, device, dtype):
|
484 |
+
self.device = device
|
485 |
+
self.dtype = dtype
|
486 |
+
|
487 |
+
@torch.no_grad()
|
488 |
+
def fetch_codes(self, input_audios_vocal,input_audios_bgm, additional_feats,layer_vocal=7,layer_bgm=7):
|
489 |
+
input_audio_vocal_0 = input_audios_vocal[[0],:]
|
490 |
+
input_audio_vocal_1 = input_audios_vocal[[1],:]
|
491 |
+
input_audio_vocal_0 = self.preprocess_audio(input_audio_vocal_0)
|
492 |
+
input_audio_vocal_1 = self.preprocess_audio(input_audio_vocal_1)
|
493 |
+
input_audios_vocal_wav2vec = (input_audio_vocal_0 + input_audio_vocal_1) / 2.0
|
494 |
+
|
495 |
+
input_audio_bgm_0 = input_audios_bgm[[0],:]
|
496 |
+
input_audio_bgm_1 = input_audios_bgm[[1],:]
|
497 |
+
input_audio_bgm_0 = self.preprocess_audio(input_audio_bgm_0)
|
498 |
+
input_audio_bgm_1 = self.preprocess_audio(input_audio_bgm_1)
|
499 |
+
input_audios_bgm_wav2vec = (input_audio_bgm_0 + input_audio_bgm_1) / 2.0
|
500 |
+
|
501 |
+
self.bestrq.eval()
|
502 |
+
|
503 |
+
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
504 |
+
# bestrq_middle = bestrq_middle.detach()
|
505 |
+
# bestrq_last = bestrq_last.detach()
|
506 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_vocal_0,input_audio_vocal_1,layer_vocal)
|
507 |
+
bestrq_emb = bestrq_emb.detach()
|
508 |
+
|
509 |
+
bestrq_emb_bgm = self.extract_bestrq_embeds(input_audio_bgm_0,input_audio_bgm_1,layer_bgm)
|
510 |
+
bestrq_emb_bgm = bestrq_emb_bgm.detach()
|
511 |
+
|
512 |
+
|
513 |
+
|
514 |
+
self.rvq_bestrq_emb.eval()
|
515 |
+
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
516 |
+
|
517 |
+
self.rvq_bestrq_bgm_emb.eval()
|
518 |
+
quantized_bestrq_emb_bgm, codes_bestrq_emb_bgm, *_ = self.rvq_bestrq_bgm_emb(bestrq_emb_bgm) # b,d,t
|
519 |
+
|
520 |
+
|
521 |
+
if('spk' in additional_feats):
|
522 |
+
self.xvecmodel.eval()
|
523 |
+
spk_embeds = self.extract_spk_embeds(input_audios)
|
524 |
+
else:
|
525 |
+
spk_embeds = None
|
526 |
+
|
527 |
+
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
528 |
+
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
529 |
+
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
530 |
+
return [codes_bestrq_emb,codes_bestrq_emb_bgm], [bestrq_emb,bestrq_emb_bgm], spk_embeds
|
531 |
+
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
532 |
+
|
533 |
+
@torch.no_grad()
|
534 |
+
def fetch_codes_batch(self, input_audios_vocal, input_audios_bgm, additional_feats,layer_vocal=7,layer_bgm=7):
|
535 |
+
input_audio_vocal_0 = input_audios_vocal[:,0,:]
|
536 |
+
input_audio_vocal_1 = input_audios_vocal[:,1,:]
|
537 |
+
input_audio_vocal_0 = self.preprocess_audio(input_audio_vocal_0)
|
538 |
+
input_audio_vocal_1 = self.preprocess_audio(input_audio_vocal_1)
|
539 |
+
input_audios_vocal_wav2vec = (input_audio_vocal_0 + input_audio_vocal_1) / 2.0
|
540 |
+
|
541 |
+
input_audio_bgm_0 = input_audios_bgm[:,0,:]
|
542 |
+
input_audio_bgm_1 = input_audios_bgm[:,1,:]
|
543 |
+
input_audio_bgm_0 = self.preprocess_audio(input_audio_bgm_0)
|
544 |
+
input_audio_bgm_1 = self.preprocess_audio(input_audio_bgm_1)
|
545 |
+
input_audios_bgm_wav2vec = (input_audio_bgm_0 + input_audio_bgm_1) / 2.0
|
546 |
+
|
547 |
+
self.bestrq.eval()
|
548 |
+
|
549 |
+
# bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
|
550 |
+
# bestrq_middle = bestrq_middle.detach()
|
551 |
+
# bestrq_last = bestrq_last.detach()
|
552 |
+
bestrq_emb = self.extract_bestrq_embeds(input_audio_vocal_0,input_audio_vocal_1,layer_vocal)
|
553 |
+
bestrq_emb = bestrq_emb.detach()
|
554 |
+
|
555 |
+
bestrq_emb_bgm = self.extract_bestrq_embeds(input_audio_bgm_0,input_audio_bgm_1,layer_bgm)
|
556 |
+
bestrq_emb_bgm = bestrq_emb_bgm.detach()
|
557 |
+
|
558 |
+
|
559 |
+
|
560 |
+
self.rvq_bestrq_emb.eval()
|
561 |
+
quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
|
562 |
+
|
563 |
+
self.rvq_bestrq_bgm_emb.eval()
|
564 |
+
quantized_bestrq_emb_bgm, codes_bestrq_emb_bgm, *_ = self.rvq_bestrq_bgm_emb(bestrq_emb_bgm) # b,d,t
|
565 |
+
|
566 |
+
|
567 |
+
if('spk' in additional_feats):
|
568 |
+
self.xvecmodel.eval()
|
569 |
+
spk_embeds = self.extract_spk_embeds(input_audios)
|
570 |
+
else:
|
571 |
+
spk_embeds = None
|
572 |
+
|
573 |
+
# return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
|
574 |
+
# return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
|
575 |
+
# return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
|
576 |
+
return [codes_bestrq_emb,codes_bestrq_emb_bgm], [bestrq_emb,bestrq_emb_bgm], spk_embeds
|
577 |
+
# return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
|
578 |
+
|
579 |
+
|
580 |
+
@torch.no_grad()
|
581 |
+
def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats,incontext_length=127,
|
582 |
+
guidance_scale=2, num_steps=20,
|
583 |
+
disable_progress=True, scenario='start_seg'):
|
584 |
+
classifier_free_guidance = guidance_scale > 1.0
|
585 |
+
device = self.device
|
586 |
+
dtype = self.dtype
|
587 |
+
# codes_bestrq_middle, codes_bestrq_last = codes
|
588 |
+
codes_bestrq_emb,codes_bestrq_emb_bgm = codes
|
589 |
+
|
590 |
+
|
591 |
+
batch_size = codes_bestrq_emb.shape[0]
|
592 |
+
|
593 |
+
|
594 |
+
quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
|
595 |
+
quantized_bestrq_emb_bgm,_,_=self.rvq_bestrq_bgm_emb.from_codes(codes_bestrq_emb_bgm)
|
596 |
+
quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
|
597 |
+
quantized_bestrq_emb_bgm = quantized_bestrq_emb_bgm.permute(0,2,1).contiguous()
|
598 |
+
if('spk' in additional_feats):
|
599 |
+
spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
|
600 |
+
|
601 |
+
num_frames = quantized_bestrq_emb.shape[1]
|
602 |
+
|
603 |
+
num_channels_latents = self.num_channels
|
604 |
+
shape = (batch_size, num_frames, 64)
|
605 |
+
latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
|
606 |
+
|
607 |
+
|
608 |
+
|
609 |
+
latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
|
610 |
+
latent_masks[:,0:latent_length] = 2
|
611 |
+
if(scenario=='other_seg'):
|
612 |
+
latent_masks[:,0:incontext_length] = 1
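# Mask semantics: 0 = padding, 1 = in-context prompt to keep, 2 = frames to generate.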
|
613 |
+
|
614 |
+
|
615 |
+
|
616 |
+
quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
|
617 |
+
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
618 |
+
quantized_bestrq_emb_bgm = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb_bgm \
|
619 |
+
+ (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
|
620 |
+
true_latents = true_latents.permute(0,2,1).contiguous()
|
621 |
+
true_latents = self.normfeat.project_sample(true_latents)
|
622 |
+
true_latents = true_latents.permute(0,2,1).contiguous()
|
623 |
+
incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
|
624 |
+
incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
|
625 |
+
|
626 |
+
|
627 |
+
attention_mask=(latent_masks > 0.5)
|
628 |
+
B, L = attention_mask.size()
|
629 |
+
attention_mask = attention_mask.view(B, 1, L)
|
630 |
+
attention_mask = attention_mask * attention_mask.transpose(-1, -2)
|
631 |
+
attention_mask = attention_mask.unsqueeze(1)
|
632 |
+
latent_mask_input = self.mask_emb(latent_masks)
|
633 |
+
|
634 |
+
if('spk' in additional_feats):
|
635 |
+
# additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
|
636 |
+
additional_model_input = torch.cat([quantized_bestrq_emb,quantized_bestrq_emb_bgm, spk_embeds],2)
|
637 |
+
else:
|
638 |
+
# additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
|
639 |
+
additional_model_input = torch.cat([quantized_bestrq_emb,quantized_bestrq_emb_bgm],2)
|
640 |
+
|
641 |
+
temperature = 1.0
|
642 |
+
t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
|
643 |
+
latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask, guidance_scale)
|
644 |
+
|
645 |
+
latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
|
646 |
+
latents = latents.permute(0,2,1).contiguous()
|
647 |
+
latents = self.normfeat.return_sample(latents)
|
648 |
+
# latents = latents.permute(0,2,1).contiguous()
|
649 |
+
return latents
|
650 |
+
|
651 |
+
@torch.no_grad()
|
652 |
+
def inference(self, input_audios_vocal,input_audios_bgm, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
|
653 |
+
disable_progress=True,layer_vocal=7,layer_bgm=3,scenario='start_seg'):
|
654 |
+
codes, embeds, spk_embeds = self.fetch_codes(input_audios_vocal,input_audios_bgm, additional_feats,layer_vocal,layer_bgm)
|
655 |
+
|
656 |
+
latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
|
657 |
+
guidance_scale=guidance_scale, num_steps=num_steps, \
|
658 |
+
disable_progress=disable_progress,scenario=scenario)
|
659 |
+
return latents
|
660 |
+
|
661 |
+
def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
|
662 |
+
divisor = 4
|
663 |
+
shape = (batch_size, num_channels_latents, num_frames, 32)
|
664 |
+
if(num_frames%divisor>0):
|
665 |
+
num_frames = round(num_frames/float(divisor))*divisor
|
666 |
+
shape = (batch_size, num_channels_latents, num_frames, 32)
|
667 |
+
latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
|
668 |
+
return latents
|
669 |
+
|
670 |
+
|
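For orientation, here is a minimal usage sketch of the two-stage interface of PromptCondAudioDiffusion defined above: fetch_codes_batch turns separated vocal/background stems into discrete RVQ codes, and inference_codes regenerates VAE latents from those codes with the flow-matching transformer. The module name, constructor arguments, checkpoint availability under ckpt/, tensor shapes, and the final VAE decode are assumptions for illustration, not prescribed by this file.

import torch
from model_septoken import PromptCondAudioDiffusion  # module name assumed from the changed-file list

# Hypothetical setup: any non-None config path satisfies the assert in __init__ (the GPT-2
# backbone config is hard-coded in the class). Instantiation still needs the repo's
# checkpoints (ckpt/encode-s12k.pt, the ContentVec HuBERT snapshot, ...).
model = PromptCondAudioDiffusion(num_channels=64, unet_model_config_path="unused.json").eval().cuda()
model.init_device_dtype(torch.device("cuda"), torch.float32)

# Stereo 48 kHz stems, shape (batch, 2, samples); 10 s of silence as a stand-in.
vocal = torch.zeros(1, 2, 48000 * 10, device="cuda")
bgm = torch.zeros(1, 2, 48000 * 10, device="cuda")

with torch.no_grad():
    # Stage 1: discrete RVQ codes for the vocal and the background stream.
    codes, _embeds, spk = model.fetch_codes_batch(vocal, bgm, additional_feats=[],
                                                  layer_vocal=7, layer_bgm=3)
    # Stage 2: the flow-matching transformer maps the codes back to VAE latents.
    num_frames = codes[0].shape[-1]                              # latent frames
    true_latents = torch.zeros(1, num_frames, 64, device="cuda")  # no in-context prompt
    latents = model.inference_codes(codes, spk, true_latents, latent_length=num_frames,
                                    additional_feats=[], guidance_scale=2, num_steps=20)
# `latents` would then be decoded to audio by the accompanying VAE (not part of this file).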
codeclm/tokenizer/Flow1dVAE/models/unet_2d_condition_additionalemb.py
CHANGED
The diff for this file is too large to render. See raw diff.

codeclm/tokenizer/Flow1dVAE/models/unet_2d_condition_flow.py
CHANGED
The diff for this file is too large to render. See raw diff.
codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/pinyin/symbols.py
CHANGED
@@ -1,71 +1,71 @@
|
|
|
1 |
+
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]
|
2 |
+
|
3 |
+
_initials = [
|
4 |
+
"^",
|
5 |
+
"b",
|
6 |
+
"c",
|
7 |
+
"ch",
|
8 |
+
"d",
|
9 |
+
"f",
|
10 |
+
"g",
|
11 |
+
"h",
|
12 |
+
"j",
|
13 |
+
"k",
|
14 |
+
"l",
|
15 |
+
"m",
|
16 |
+
"n",
|
17 |
+
"p",
|
18 |
+
"q",
|
19 |
+
"r",
|
20 |
+
"s",
|
21 |
+
"sh",
|
22 |
+
"t",
|
23 |
+
"x",
|
24 |
+
"z",
|
25 |
+
"zh",
|
26 |
+
]
|
27 |
+
|
28 |
+
_tones = ["1", "2", "3", "4", "5"]
|
29 |
+
|
30 |
+
_finals = [
|
31 |
+
"a",
|
32 |
+
"ai",
|
33 |
+
"an",
|
34 |
+
"ang",
|
35 |
+
"ao",
|
36 |
+
"e",
|
37 |
+
"ei",
|
38 |
+
"en",
|
39 |
+
"eng",
|
40 |
+
"er",
|
41 |
+
"i",
|
42 |
+
"ia",
|
43 |
+
"ian",
|
44 |
+
"iang",
|
45 |
+
"iao",
|
46 |
+
"ie",
|
47 |
+
"ii",
|
48 |
+
"iii",
|
49 |
+
"in",
|
50 |
+
"ing",
|
51 |
+
"iong",
|
52 |
+
"iou",
|
53 |
+
"o",
|
54 |
+
"ong",
|
55 |
+
"ou",
|
56 |
+
"u",
|
57 |
+
"ua",
|
58 |
+
"uai",
|
59 |
+
"uan",
|
60 |
+
"uang",
|
61 |
+
"uei",
|
62 |
+
"uen",
|
63 |
+
"ueng",
|
64 |
+
"uo",
|
65 |
+
"v",
|
66 |
+
"van",
|
67 |
+
"ve",
|
68 |
+
"vn",
|
69 |
+
]
|
70 |
+
|
71 |
+
symbols = _pause + _initials + [i + j for i in _finals for j in _tones]
|
codeclm/tokenizer/Flow1dVAE/tools/infer_bsrnnvae441k.py
CHANGED
@@ -1,47 +1,47 @@
|
|
1 |
+
import json
|
2 |
+
import torch
|
3 |
+
from tqdm import tqdm
|
4 |
+
import torchaudio
|
5 |
+
import librosa
|
6 |
+
import os
|
7 |
+
import math
|
8 |
+
import numpy as np
|
9 |
+
from tools.get_bsrnnvae import get_bsrnnvae
|
10 |
+
import tools.torch_tools as torch_tools
|
11 |
+
|
12 |
+
class Tango:
|
13 |
+
def __init__(self, \
|
14 |
+
device="cuda:0"):
|
15 |
+
|
16 |
+
self.sample_rate = 44100
|
17 |
+
self.device = device
|
18 |
+
|
19 |
+
self.vae = get_bsrnnvae()
|
20 |
+
self.vae = self.vae.eval().to(device)
|
21 |
+
|
22 |
+
def sound2sound_generate_longterm(self, fname, batch_size=1, duration=15.36, steps=200, disable_progress=False):
|
23 |
+
""" Genrate audio without condition. """
|
24 |
+
num_frames = math.ceil(duration * 100. / 8)
|
25 |
+
with torch.no_grad():
|
26 |
+
orig_samples, fs = torchaudio.load(fname)
|
27 |
+
if(fs!=44100):
|
28 |
+
orig_samples = torchaudio.functional.resample(orig_samples, fs, 44100)
|
29 |
+
fs = 44100
|
30 |
+
if(orig_samples.shape[-1]<int(duration*44100*2)):
|
31 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration*44100*2+480)-orig_samples.shape[-1], \
|
32 |
+
dtype=orig_samples.dtype, device=orig_samples.device)], -1)
|
33 |
+
# orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
|
34 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
|
35 |
+
if(fs!=44100):orig_samples = torchaudio.functional.resample(orig_samples, fs, 44100)
|
36 |
+
# resampled_audios = orig_samples[[0],int(4.64*44100):int(35.36*48000)+480].clamp(-1,1)
|
37 |
+
resampled_audios = orig_samples[[0],0:int(duration*2*44100)+480].clamp(-1,1)
|
38 |
+
orig_samples = orig_samples[[0],0:int(duration*2*44100)]
|
39 |
+
|
40 |
+
audio = self.vae(orig_samples[:,None,:])[:,0,:]
|
41 |
+
|
42 |
+
if(orig_samples.shape[-1]<audio.shape[-1]):
|
43 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
|
44 |
+
else:
|
45 |
+
orig_samples = orig_samples[:,0:audio.shape[-1]]
|
46 |
+
output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
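# Stacks the (cropped) original waveform with the VAE reconstruction so the two can be compared side by side.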
|
47 |
+
return output
|
codeclm/tokenizer/Flow1dVAE/tools/infer_bsrnnvae441k_vocal.py
CHANGED
@@ -1,47 +1,47 @@
|
|
|
1 |
+
import json
|
2 |
+
import torch
|
3 |
+
from tqdm import tqdm
|
4 |
+
import torchaudio
|
5 |
+
import librosa
|
6 |
+
import os
|
7 |
+
import math
|
8 |
+
import numpy as np
|
9 |
+
from tools.get_bsrnnvae import get_bsrnnvae
|
10 |
+
import tools.torch_tools as torch_tools
|
11 |
+
|
12 |
+
class Tango:
|
13 |
+
def __init__(self, \
|
14 |
+
device="cuda:0"):
|
15 |
+
|
16 |
+
self.sample_rate = 44100
|
17 |
+
self.device = device
|
18 |
+
|
19 |
+
self.vae = get_bsrnnvae()
|
20 |
+
self.vae = self.vae.eval().to(device)
|
21 |
+
|
22 |
+
def sound2sound_generate_longterm(self, fname, batch_size=1, duration=20.48, steps=200, disable_progress=False):
|
23 |
+
""" Genrate audio without condition. """
|
24 |
+
num_frames = math.ceil(duration * 100. / 8)
|
25 |
+
with torch.no_grad():
|
26 |
+
orig_samples, fs = torchaudio.load(fname)
|
27 |
+
if(fs!=44100):
|
28 |
+
orig_samples = torchaudio.functional.resample(orig_samples, fs, 44100)
|
29 |
+
fs = 44100
|
30 |
+
if(orig_samples.shape[-1]<int(duration*44100*2)):
|
31 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration*44100*2+480)-orig_samples.shape[-1], \
|
32 |
+
dtype=orig_samples.dtype, device=orig_samples.device)], -1)
|
33 |
+
# orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
|
34 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
|
35 |
+
if(fs!=44100):orig_samples = torchaudio.functional.resample(orig_samples, fs, 44100)
|
36 |
+
# resampled_audios = orig_samples[[0],int(4.64*44100):int(35.36*48000)+480].clamp(-1,1)
|
37 |
+
resampled_audios = orig_samples[[0],0:int(duration*2*44100)+480].clamp(-1,1)
|
38 |
+
orig_samples = orig_samples[[0],0:int(duration*2*44100)]
|
39 |
+
|
40 |
+
audio = self.vae(orig_samples[:,None,:])[:,0,:]
|
41 |
+
|
42 |
+
if(orig_samples.shape[-1]<audio.shape[-1]):
|
43 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
|
44 |
+
else:
|
45 |
+
orig_samples = orig_samples[:,0:audio.shape[-1]]
|
46 |
+
output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
|
47 |
+
return output
|
codeclm/tokenizer/Flow1dVAE/tools/infer_hifigan48k_speech.py
CHANGED
@@ -1,56 +1,56 @@
|
|
|
1 |
+
import json
|
2 |
+
import torch
|
3 |
+
from tqdm import tqdm
|
4 |
+
import torchaudio
|
5 |
+
import librosa
|
6 |
+
import os
|
7 |
+
import math
|
8 |
+
import numpy as np
|
9 |
+
from get_melvaehifigan48k import build_pretrained_models
|
10 |
+
import tools.torch_tools as torch_tools
|
11 |
+
|
12 |
+
class Tango:
|
13 |
+
def __init__(self, \
|
14 |
+
device="cuda:0"):
|
15 |
+
|
16 |
+
self.sample_rate = 48000
|
17 |
+
self.device = device
|
18 |
+
|
19 |
+
self.vae, self.stft = build_pretrained_models()
|
20 |
+
self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)
|
21 |
+
|
22 |
+
def mel_spectrogram_to_waveform(self, mel_spectrogram):
|
23 |
+
if mel_spectrogram.dim() == 4:
|
24 |
+
mel_spectrogram = mel_spectrogram.squeeze(1)
|
25 |
+
|
26 |
+
waveform = self.vocoder(mel_spectrogram)
|
27 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
28 |
+
waveform = waveform.cpu().float()
|
29 |
+
return waveform
|
30 |
+
|
31 |
+
def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False):
|
32 |
+
""" Genrate audio without condition. """
|
33 |
+
num_frames = math.ceil(duration * 100. / 8)
|
34 |
+
with torch.no_grad():
|
35 |
+
orig_samples, fs = torchaudio.load(fname)
|
36 |
+
if(orig_samples.shape[-1]<int(duration*48000)):
|
37 |
+
orig_samples = orig_samples.repeat(1,math.ceil(int(duration*48000)/float(orig_samples.shape[-1])))
|
38 |
+
# orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
|
39 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
|
40 |
+
if(fs!=48000):orig_samples = torchaudio.functional.resample(orig_samples, fs, 48000)
|
41 |
+
# resampled_audios = orig_samples[[0],int(4.64*48000):int(35.36*48000)+480].clamp(-1,1)
|
42 |
+
resampled_audios = orig_samples[[0],0:int(duration*48000)+480].clamp(-1,1)
|
43 |
+
orig_samples = orig_samples[[0],0:int(duration*48000)]
|
44 |
+
|
45 |
+
mel, _, _ = torch_tools.wav_to_fbank2(resampled_audios, -1, fn_STFT=self.stft)
|
46 |
+
mel = mel.unsqueeze(1).to(self.device)
|
47 |
+
|
48 |
+
audio = self.vae.decode_to_waveform(mel)
|
49 |
+
audio = torch.from_numpy(audio)
|
50 |
+
|
51 |
+
if(orig_samples.shape[-1]<audio.shape[-1]):
|
52 |
+
orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
|
53 |
+
else:
|
54 |
+
orig_samples = orig_samples[:,0:audio.shape[-1]]
|
55 |
+
output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
|
56 |
+
return output
|
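For orientation, a minimal driver for the Tango helper above — illustration only: the import assumes the sketch is run from the Flow1dVAE directory (matching the file's own `import tools.torch_tools`), and the input/output file names are hypothetical.

# Hypothetical usage sketch (not part of the commit): reconstruct a clip through the 48 kHz mel-VAE + HiFi-GAN pair.
import torchaudio
from tools.infer_hifigan48k_speech import Tango

tango = Tango(device="cuda:0")
out = tango.sound2sound_generate_longterm("speech_example.wav", duration=10.24)
# Row 0 is the (padded) original, row 1 the reconstruction; save the reconstruction at 48 kHz.
torchaudio.save("speech_example_recon.wav", out[[1]], 48000)
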
codeclm/tokenizer/Flow1dVAE/tools/infer_hifigan48k_vocal.py
CHANGED
@@ -1,57 +1,57 @@ (old and new hunks identical; shown once)
import json
import torch
from tqdm import tqdm
import torchaudio
import librosa
import os
import math
import numpy as np
from get_melvaehifigan48k import build_pretrained_models
import tools.torch_tools as torch_tools

class Tango:
    def __init__(self, \
        device="cuda:0"):

        self.sample_rate = 48000
        self.device = device

        self.vae, self.stft = build_pretrained_models()
        self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        if mel_spectrogram.dim() == 4:
            mel_spectrogram = mel_spectrogram.squeeze(1)

        waveform = self.vocoder(mel_spectrogram)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        waveform = waveform.cpu().float()
        return waveform

    def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False):
        """ Genrate audio without condition. """
        num_frames = math.ceil(duration * 100. / 8)
        with torch.no_grad():
            orig_samples, fs = torchaudio.load(fname)
            if(orig_samples.shape[-1]<int(duration*48000*2)):
                orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration*48000*2+480)-orig_samples.shape[-1], \
                    dtype=orig_samples.dtype, device=orig_samples.device)], -1)
            # orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            if(fs!=48000):orig_samples = torchaudio.functional.resample(orig_samples, fs, 48000)
            # resampled_audios = orig_samples[[0],int(4.64*48000):int(35.36*48000)+480].clamp(-1,1)
            resampled_audios = orig_samples[[0],0:int(duration*2*48000)+480].clamp(-1,1)
            orig_samples = orig_samples[[0],0:int(duration*2*48000)]

            mel, _, _ = torch_tools.wav_to_fbank2(resampled_audios, -1, fn_STFT=self.stft)
            mel = mel.unsqueeze(1).to(self.device)

            audio = self.vae.decode_to_waveform(mel)
            audio = torch.from_numpy(audio)

            if(orig_samples.shape[-1]<audio.shape[-1]):
                orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
            else:
                orig_samples = orig_samples[:,0:audio.shape[-1]]
            output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
            return output

codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k.py
CHANGED
@@ -1,59 +1,59 @@ (old and new hunks identical; shown once)
import json
import torch
from tqdm import tqdm
import torchaudio
import librosa
import os
import math
import numpy as np
from get_melvaehifigan48k import build_pretrained_models
import tools.torch_tools as torch_tools

class Tango:
    def __init__(self, \
        device="cuda:0"):

        self.sample_rate = 48000
        self.device = device

        self.vae, self.stft = build_pretrained_models()
        self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        if mel_spectrogram.dim() == 4:
            mel_spectrogram = mel_spectrogram.squeeze(1)

        waveform = self.vocoder(mel_spectrogram)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        waveform = waveform.cpu().float()
        return waveform

    def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False):
        """ Genrate audio without condition. """
        num_frames = math.ceil(duration * 100. / 8)
        with torch.no_grad():
            orig_samples, fs = torchaudio.load(fname)
            if(orig_samples.shape[-1]<int(duration*48000*3)):
                orig_samples = orig_samples.repeat(1,math.ceil(int(duration*48000*3)/float(orig_samples.shape[-1])))
            # orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            if(fs!=48000):orig_samples = torchaudio.functional.resample(orig_samples, fs, 48000)
            # resampled_audios = orig_samples[[0],int(4.64*48000):int(35.36*48000)+480].clamp(-1,1)
            resampled_audios = orig_samples[[0],int(0*48000):int(duration*3*48000)+480].clamp(-1,1)
            orig_samples = orig_samples[[0],:]

            mel, _, _ = torch_tools.wav_to_fbank2(resampled_audios, -1, fn_STFT=self.stft)
            mel = mel.unsqueeze(1).to(self.device)
            latents = torch.cat([self.vae.get_first_stage_encoding(self.vae.encode_first_stage(mel[[m]])) for m in range(mel.shape[0])],0)

            mel = self.vae.decode_first_stage(latents)
            audio = self.vae.decode_to_waveform(mel)
            audio = torch.from_numpy(audio)

            orig_samples = orig_samples[...,0:int(duration * 3 * 48000)]
            if(orig_samples.shape[-1]<audio.shape[-1]):
                orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
            else:
                orig_samples = orig_samples[:,0:audio.shape[-1]]
            output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
            return output

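The only substantive difference from the plain HiFi-GAN scripts above is the VAE latent round trip inside sound2sound_generate_longterm; it is sketched in isolation below, assuming a `tango` instance of this file's Tango class and a `mel` batch shaped as in the file.

# Sketch of the latent round trip used above (illustration only; `tango` and `mel` are assumed to exist).
latents = torch.cat([tango.vae.get_first_stage_encoding(tango.vae.encode_first_stage(mel[[m]]))
                     for m in range(mel.shape[0])], 0)           # per-item encode to VAE latents
mel_rec = tango.vae.decode_first_stage(latents)                  # latents -> mel spectrogram
audio = torch.from_numpy(tango.vae.decode_to_waveform(mel_rec))  # mel -> 48 kHz waveform
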
codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_soundmusic.py
CHANGED
@@ -1,61 +1,61 @@ (old and new hunks identical; shown once)
import json
import torch
from tqdm import tqdm
import torchaudio
import librosa
import os
import math
import numpy as np
from get_melvaehifigan48k import build_pretrained_models
import tools.torch_tools as torch_tools

class Tango:
    def __init__(self, \
        device="cuda:0"):

        self.sample_rate = 48000
        self.device = device

        self.vae, self.stft = build_pretrained_models()
        self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)

        # print(sum(p.numel() for p in self.vae.parameters()));exit()

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        if mel_spectrogram.dim() == 4:
            mel_spectrogram = mel_spectrogram.squeeze(1)

        waveform = self.vocoder(mel_spectrogram)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        waveform = waveform.cpu().float()
        return waveform

    def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False):
        """ Genrate audio without condition. """
        num_frames = math.ceil(duration * 100. / 8)
        with torch.no_grad():
            orig_samples, fs = torchaudio.load(fname)
            if(orig_samples.shape[-1]<int(duration*48000)):
                orig_samples = orig_samples.repeat(1,math.ceil(int(duration*48000)/float(orig_samples.shape[-1])))
            # orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            if(fs!=48000):orig_samples = torchaudio.functional.resample(orig_samples, fs, 48000)
            # resampled_audios = orig_samples[[0],int(4.64*48000):int(35.36*48000)+480].clamp(-1,1)
            resampled_audios = orig_samples[[0],int(0*48000):int(duration*48000)+480].clamp(-1,1)
            orig_samples = orig_samples[[0],:]

            mel, _, _ = torch_tools.wav_to_fbank2(resampled_audios, -1, fn_STFT=self.stft)
            mel = mel.unsqueeze(1).to(self.device)
            latents = torch.cat([self.vae.get_first_stage_encoding(self.vae.encode_first_stage(mel[[m]])) for m in range(mel.shape[0])],0)

            mel = self.vae.decode_first_stage(latents)
            audio = self.vae.decode_to_waveform(mel)
            audio = torch.from_numpy(audio)

            orig_samples = orig_samples[...,0:int(duration * 48000)]
            if(orig_samples.shape[-1]<audio.shape[-1]):
                orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
            else:
                orig_samples = orig_samples[:,0:audio.shape[-1]]
            output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
            return output

codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_speech.py
CHANGED
@@ -1,58 +1,58 @@ (old and new hunks identical; shown once)
import json
import torch
from tqdm import tqdm
import torchaudio
import librosa
import os
import math
import numpy as np
from get_melvaehifigan48k import build_pretrained_models
import tools.torch_tools as torch_tools

class Tango:
    def __init__(self, \
        device="cuda:0"):

        self.sample_rate = 48000
        self.device = device

        self.vae, self.stft = build_pretrained_models()
        self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        if mel_spectrogram.dim() == 4:
            mel_spectrogram = mel_spectrogram.squeeze(1)

        waveform = self.vocoder(mel_spectrogram)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        waveform = waveform.cpu().float()
        return waveform

    def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False):
        """ Genrate audio without condition. """
        num_frames = math.ceil(duration * 100. / 8)
        with torch.no_grad():
            orig_samples, fs = torchaudio.load(fname)
            if(orig_samples.shape[-1]<int(duration*48000)):
                orig_samples = orig_samples.repeat(1,math.ceil(int(duration*48000)/float(orig_samples.shape[-1])))
            # orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            if(fs!=48000):orig_samples = torchaudio.functional.resample(orig_samples, fs, 48000)
            # resampled_audios = orig_samples[[0],int(4.64*48000):int(35.36*48000)+480].clamp(-1,1)
            resampled_audios = orig_samples[[0],0:int(duration*48000)+480].clamp(-1,1)
            orig_samples = orig_samples[[0],0:int(duration*48000)]

            mel, _, _ = torch_tools.wav_to_fbank2(resampled_audios, -1, fn_STFT=self.stft)
            mel = mel.unsqueeze(1).to(self.device)
            latents = torch.cat([self.vae.get_first_stage_encoding(self.vae.encode_first_stage(mel[[m]])) for m in range(mel.shape[0])],0)

            mel = self.vae.decode_first_stage(latents)
            audio = self.vae.decode_to_waveform(mel)
            audio = torch.from_numpy(audio)

            if(orig_samples.shape[-1]<audio.shape[-1]):
                orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
            else:
                orig_samples = orig_samples[:,0:audio.shape[-1]]
            output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
            return output

codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_vocal.py
CHANGED
@@ -1,59 +1,59 @@ (old and new hunks identical; shown once)
import json
import torch
from tqdm import tqdm
import torchaudio
import librosa
import os
import math
import numpy as np
from get_melvaehifigan48k import build_pretrained_models
import tools.torch_tools as torch_tools

class Tango:
    def __init__(self, \
        device="cuda:0"):

        self.sample_rate = 48000
        self.device = device

        self.vae, self.stft = build_pretrained_models()
        self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        if mel_spectrogram.dim() == 4:
            mel_spectrogram = mel_spectrogram.squeeze(1)

        waveform = self.vocoder(mel_spectrogram)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        waveform = waveform.cpu().float()
        return waveform

    def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False):
        """ Genrate audio without condition. """
        num_frames = math.ceil(duration * 100. / 8)
        with torch.no_grad():
            orig_samples, fs = torchaudio.load(fname)
            if(orig_samples.shape[-1]<int(duration*48000*2)):
                orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration*48000*2+480)-orig_samples.shape[-1], \
                    dtype=orig_samples.dtype, device=orig_samples.device)], -1)
            # orig_samples = torch.cat([torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device), orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration * fs)//2, dtype=orig_samples.dtype, device=orig_samples.device)], -1).to(self.device)
            if(fs!=48000):orig_samples = torchaudio.functional.resample(orig_samples, fs, 48000)
            # resampled_audios = orig_samples[[0],int(4.64*48000):int(35.36*48000)+480].clamp(-1,1)
            resampled_audios = orig_samples[[0],0:int(duration*2*48000)+480].clamp(-1,1)
            orig_samples = orig_samples[[0],0:int(duration*2*48000)]

            mel, _, _ = torch_tools.wav_to_fbank2(resampled_audios, -1, fn_STFT=self.stft)
            mel = mel.unsqueeze(1).to(self.device)
            latents = torch.cat([self.vae.get_first_stage_encoding(self.vae.encode_first_stage(mel[[m]])) for m in range(mel.shape[0])],0)

            mel = self.vae.decode_first_stage(latents)
            audio = self.vae.decode_to_waveform(mel)
            audio = torch.from_numpy(audio)

            if(orig_samples.shape[-1]<audio.shape[-1]):
                orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], audio.shape[-1]-orig_samples.shape[-1], dtype=orig_samples.dtype, device=orig_samples.device)],-1)
            else:
                orig_samples = orig_samples[:,0:audio.shape[-1]]
            output = torch.cat([orig_samples.detach().cpu(),audio.detach().cpu()],0)
            return output

codeclm/tokenizer/Flow1dVAE/tools/mix.py
CHANGED
@@ -1,51 +1,51 @@ (old and new hunks identical; shown once)
import numpy as np


def a_weight(fs, n_fft, min_db=-80.0):
    freq = np.linspace(0, fs // 2, n_fft // 2 + 1)
    freq_sq = np.power(freq, 2)
    freq_sq[0] = 1.0
    weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq)
                           - np.log10(freq_sq + 12194 ** 2)
                           - np.log10(freq_sq + 20.6 ** 2)
                           - 0.5 * np.log10(freq_sq + 107.7 ** 2)
                           - 0.5 * np.log10(freq_sq + 737.9 ** 2))
    weight = np.maximum(weight, min_db)

    return weight


def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"):
    if fs == 16000:
        n_fft = 2048
    elif fs == 44100:
        n_fft = 4096
    else:
        raise Exception("Invalid fs {}".format(fs))
    stride = n_fft // 2

    gain = []
    for i in range(0, len(sound) - n_fft + 1, stride):
        if mode == "RMSE":
            g = np.mean(sound[i: i + n_fft] ** 2)
        elif mode == "A_weighting":
            spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft])
            power_spec = np.abs(spec) ** 2
            a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10)
            g = np.sum(a_weighted_spec)
        else:
            raise Exception("Invalid mode {}".format(mode))
        gain.append(g)

    gain = np.array(gain)
    gain = np.maximum(gain, np.power(10, min_db / 10))
    gain_db = 10 * np.log10(gain)
    return gain_db


def mix(sound1, sound2, r, fs):
    gain1 = np.max(compute_gain(sound1, fs))  # Decibel
    gain2 = np.max(compute_gain(sound2, fs))
    t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) * (1 - r) / r)
    sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2))
    return sound

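A quick usage sketch for mix() — the two signals below are synthetic and only illustrate the calling convention; at 16 kHz, compute_gain() runs A-weighted frame energies over a 2048-point FFT.

# Illustration only (not part of the commit).
import numpy as np
from tools.mix import mix

fs = 16000
tone = np.sin(2 * np.pi * 440 * np.arange(fs) / fs).astype(np.float32)   # 1 s, 440 Hz tone
noise = (0.1 * np.random.randn(fs)).astype(np.float32)                   # 1 s of low-level noise
blended = mix(tone, noise, r=0.5, fs=fs)   # r sets the A-weighted loudness balance; the result is energy-normalized
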
codeclm/tokenizer/Flow1dVAE/tools/torch_tools.py
CHANGED
@@ -1,143 +1,143 @@ (old and new hunks identical; shown once)
import torch
import torchaudio
import random
import itertools
import numpy as np
from tools.mix import mix


def normalize_wav(waveform):
    waveform = waveform - torch.mean(waveform)
    waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
    return waveform * 0.5


def pad_wav(waveform, segment_length):
    waveform_length = len(waveform)

    if segment_length is None or waveform_length == segment_length:
        return waveform
    elif waveform_length > segment_length:
        return waveform[:segment_length]
    else:
        pad_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
        waveform = torch.cat([waveform, pad_wav])
        return waveform


def _pad_spec(fbank, target_length=1024):
    batch, n_frames, channels = fbank.shape
    p = target_length - n_frames
    if p > 0:
        pad = torch.zeros(batch, p, channels).to(fbank.device)
        fbank = torch.cat([fbank, pad], 1)
    elif p < 0:
        fbank = fbank[:, :target_length, :]

    if channels % 2 != 0:
        fbank = fbank[:, :, :-1]

    return fbank


def read_wav_file(filename, segment_length):
    waveform, sr = torchaudio.load(filename)  # Faster!!!
    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)[0]
    try:
        waveform = normalize_wav(waveform)
    except:
        print ("Exception normalizing:", filename)
        waveform = torch.ones(160000)
    waveform = pad_wav(waveform, segment_length).unsqueeze(0)
    waveform = waveform / torch.max(torch.abs(waveform))
    waveform = 0.5 * waveform
    return waveform


def get_mel_from_wav(audio, _stft):
    audio = torch.nan_to_num(torch.clip(audio, -1, 1))
    audio = torch.autograd.Variable(audio, requires_grad=False)
    melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
    return melspec, log_magnitudes_stft, energy


def wav_to_fbank(paths, target_length=1024, fn_STFT=None):
    assert fn_STFT is not None

    waveform = torch.cat([read_wav_file(path, target_length * 160) for path in paths], 0)  # hop size is 160

    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
    fbank = fbank.transpose(1, 2)
    log_magnitudes_stft = log_magnitudes_stft.transpose(1, 2)

    fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
        log_magnitudes_stft, target_length
    )

    return fbank, log_magnitudes_stft, waveform

def wav_to_fbank2(waveform, target_length=-1, fn_STFT=None):
    assert fn_STFT is not None

    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
    fbank = fbank.transpose(1, 2)
    log_magnitudes_stft = log_magnitudes_stft.transpose(1, 2)
    # print(fbank.shape, log_magnitudes_stft.shape)

    if(target_length>0):
        fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
            log_magnitudes_stft, target_length
        )

    return fbank, log_magnitudes_stft, waveform


def uncapitalize(s):
    if s:
        return s[:1].lower() + s[1:]
    else:
        return ""


def mix_wavs_and_captions(path1, path2, caption1, caption2, target_length=1024):
    sound1 = read_wav_file(path1, target_length * 160)[0].numpy()
    sound2 = read_wav_file(path2, target_length * 160)[0].numpy()
    mixed_sound = mix(sound1, sound2, 0.5, 16000).reshape(1, -1)
    mixed_caption = "{} and {}".format(caption1, uncapitalize(caption2))
    return mixed_sound, mixed_caption


def augment(paths, texts, num_items=4, target_length=1024):
    mixed_sounds, mixed_captions = [], []
    combinations = list(itertools.combinations(list(range(len(texts))), 2))
    random.shuffle(combinations)
    if len(combinations) < num_items:
        selected_combinations = combinations
    else:
        selected_combinations = combinations[:num_items]

    for (i, j) in selected_combinations:
        new_sound, new_caption = mix_wavs_and_captions(paths[i], paths[j], texts[i], texts[j], target_length)
        mixed_sounds.append(new_sound)
        mixed_captions.append(new_caption)

    waveform = torch.tensor(np.concatenate(mixed_sounds, 0))
    waveform = waveform / torch.max(torch.abs(waveform))
    waveform = 0.5 * waveform

    return waveform, mixed_captions


def augment_wav_to_fbank(paths, texts, num_items=4, target_length=1024, fn_STFT=None):
    assert fn_STFT is not None

    waveform, captions = augment(paths, texts)
    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
    fbank = fbank.transpose(1, 2)
    log_magnitudes_stft = log_magnitudes_stft.transpose(1, 2)

    fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
        log_magnitudes_stft, target_length
    )

    return fbank, log_magnitudes_stft, waveform, captions

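A small sketch of the helpers above (the wav path is hypothetical): read_wav_file resamples to 16 kHz, pads or trims to segment_length samples and peak-normalizes to ±0.5, while wav_to_fbank additionally needs an STFT module, for example the second return value of build_pretrained_models().

from tools.torch_tools import read_wav_file, wav_to_fbank

wav = read_wav_file("example.wav", segment_length=1024 * 160)   # -> tensor of shape [1, 163840] at 16 kHz
# With an fn_STFT object available (e.g. the stft returned by build_pretrained_models()):
# fbank, log_mag, wav = wav_to_fbank(["example.wav"], target_length=1024, fn_STFT=stft)
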
codeclm/tokenizer/audio_tokenizer.py
CHANGED
@@ -208,9 +208,9 @@ class Flow1dVAESeparate(AudioTokenizer):
         return codes_vocal, codes_bgm

     @torch.no_grad()
-    def decode(self, codes: torch.Tensor, prompt_vocal = None, prompt_bgm = None):
+    def decode(self, codes: torch.Tensor, prompt_vocal = None, prompt_bgm = None, chunked=False):
         wav = self.model.code2sound(codes, prompt_vocal=prompt_vocal, prompt_bgm=prompt_bgm, guidance_scale=1.5,
-                                    num_steps=50, disable_progress=False) # [B,N,T] -> [B,T]
+                                    num_steps=50, disable_progress=False, chunked=chunked) # [B,N,T] -> [B,T]
         return wav[None]

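Call-site sketch for the new chunked flag, which is simply forwarded to code2sound. That chunked decoding lowers peak GPU memory is an inference from the low-memory scripts added in this commit, not something this file states.

# `codes` is the [B, N, T] token tensor returned by encode(); decode() wraps code2sound and returns wav[None].
wav = seperate_tokenizer.decode(codes, prompt_vocal=None, prompt_bgm=None, chunked=True)
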
generate_lowmem.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
import os

import time
import json
import torch
import torchaudio
import numpy as np
from omegaconf import OmegaConf
from codeclm.models import builders

from codeclm.trainer.codec_song_pl import CodecLM_PL
from codeclm.models import CodecLM
from third_party.demucs.models.pretrained import get_model_from_yaml

auto_prompt_type = ['Pop', 'R&B', 'Dance', 'Jazz', 'Folk', 'Rock', 'Chinese Style', 'Chinese Tradition', 'Metal', 'Reggae', 'Chinese Opera', 'Auto']

class Separator:
    def __init__(self, dm_model_path='third_party/demucs/ckpt/htdemucs.pth', dm_config_path='third_party/demucs/ckpt/htdemucs.yaml', gpu_id=0) -> None:
        if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():
            self.device = torch.device(f"cuda:{gpu_id}")
        else:
            self.device = torch.device("cpu")
        self.demucs_model = self.init_demucs_model(dm_model_path, dm_config_path)

    def init_demucs_model(self, model_path, config_path):
        model = get_model_from_yaml(config_path, model_path)
        model.to(self.device)
        model.eval()
        return model

    def load_audio(self, f):
        a, fs = torchaudio.load(f)
        if (fs != 48000):
            a = torchaudio.functional.resample(a, fs, 48000)
        if a.shape[-1] >= 48000*10:
            a = a[..., :48000*10]
        else:
            a = torch.cat([a, a], -1)
        return a[:, 0:48000*10]

    def run(self, audio_path, output_dir='tmp', ext=".flac"):
        os.makedirs(output_dir, exist_ok=True)
        name, _ = os.path.splitext(os.path.split(audio_path)[-1])
        output_paths = []

        for stem in self.demucs_model.sources:
            output_path = os.path.join(output_dir, f"{name}_{stem}{ext}")
            if os.path.exists(output_path):
                output_paths.append(output_path)
        if len(output_paths) == 1:  # 4
            vocal_path = output_paths[0]
        else:
            drums_path, bass_path, other_path, vocal_path = self.demucs_model.separate(audio_path, output_dir, device=self.device)
            for path in [drums_path, bass_path, other_path]:
                os.remove(path)
        full_audio = self.load_audio(audio_path)
        vocal_audio = self.load_audio(vocal_path)
        bgm_audio = full_audio - vocal_audio
        return full_audio, vocal_audio, bgm_audio



if __name__ == "__main__":
    torch.backends.cudnn.enabled = False
    OmegaConf.register_new_resolver("eval", lambda x: eval(x))
    OmegaConf.register_new_resolver("concat", lambda *x: [xxx for xx in x for xxx in xx])
    OmegaConf.register_new_resolver("get_fname", lambda: os.path.splitext(os.path.basename(sys.argv[1]))[0])
    OmegaConf.register_new_resolver("load_yaml", lambda x: list(OmegaConf.load(x)))
    np.random.seed(int(time.time()))
    ckpt_path = sys.argv[1]
    input_jsonl = sys.argv[2]
    save_dir = sys.argv[3]
    cfg_path = os.path.join(ckpt_path, 'config.yaml')
    ckpt_path = os.path.join(ckpt_path, 'model.pt')
    cfg = OmegaConf.load(cfg_path)
    cfg.mode = 'inference'
    max_duration = cfg.max_dur

    separator = Separator()
    auto_prompt = torch.load('ckpt/prompt.pt')
    audio_tokenizer = builders.get_audio_tokenizer_model(cfg.audio_tokenizer_checkpoint, cfg)
    if "audio_tokenizer_checkpoint_sep" in cfg.keys():
        seperate_tokenizer = builders.get_audio_tokenizer_model(cfg.audio_tokenizer_checkpoint_sep, cfg)
    else:
        seperate_tokenizer = None
    audio_tokenizer = audio_tokenizer.eval().cuda()
    if seperate_tokenizer is not None:
        seperate_tokenizer = seperate_tokenizer.eval().cuda()

    merge_prompt = [item for sublist in auto_prompt.values() for item in sublist]
    with open(input_jsonl, "r") as fp:
        lines = fp.readlines()
    new_items = []
    for line in lines:
        item = json.loads(line)
        target_wav_name = f"{save_dir}/audios/{item['idx']}.flac"
        # get prompt audio
        if "prompt_audio_path" in item:
            assert os.path.exists(item['prompt_audio_path']), f"prompt_audio_path {item['prompt_audio_path']} not found"
            assert 'auto_prompt_audio_type' not in item, f"auto_prompt_audio_type and prompt_audio_path cannot be used together"
            pmt_wav, vocal_wav, bgm_wav = separator.run(item['prompt_audio_path'])
            item['raw_pmt_wav'] = pmt_wav
            item['raw_vocal_wav'] = vocal_wav
            item['raw_bgm_wav'] = bgm_wav
            if pmt_wav.dim() == 2:
                pmt_wav = pmt_wav[None]
            if pmt_wav.dim() != 3:
                raise ValueError("Melody wavs should have a shape [B, C, T].")
            pmt_wav = list(pmt_wav)
            if vocal_wav.dim() == 2:
                vocal_wav = vocal_wav[None]
            if vocal_wav.dim() != 3:
                raise ValueError("Vocal wavs should have a shape [B, C, T].")
            vocal_wav = list(vocal_wav)
            if bgm_wav.dim() == 2:
                bgm_wav = bgm_wav[None]
            if bgm_wav.dim() != 3:
                raise ValueError("BGM wavs should have a shape [B, C, T].")
            bgm_wav = list(bgm_wav)
            if type(pmt_wav) == list:
                pmt_wav = torch.stack(pmt_wav, dim=0)
            if type(vocal_wav) == list:
                vocal_wav = torch.stack(vocal_wav, dim=0)
            if type(bgm_wav) == list:
                bgm_wav = torch.stack(bgm_wav, dim=0)
            pmt_wav = pmt_wav.cuda()
            vocal_wav = vocal_wav.cuda()
            bgm_wav = bgm_wav.cuda()
            pmt_wav, _ = audio_tokenizer.encode(pmt_wav)
            vocal_wav, bgm_wav = seperate_tokenizer.encode(vocal_wav, bgm_wav)
            melody_is_wav = False
        elif "auto_prompt_audio_type" in item:
            assert item["auto_prompt_audio_type"] in auto_prompt_type, f"auto_prompt_audio_type {item['auto_prompt_audio_type']} not found"
            if item["auto_prompt_audio_type"] == "Auto":
                prompt_token = merge_prompt[np.random.randint(0, len(merge_prompt))]
            else:
                prompt_token = auto_prompt[item["auto_prompt_audio_type"]][np.random.randint(0, len(auto_prompt[item["auto_prompt_audio_type"]]))]
            pmt_wav = prompt_token[:,[0],:]
            vocal_wav = prompt_token[:,[1],:]
            bgm_wav = prompt_token[:,[2],:]
            melody_is_wav = False
        else:
            pmt_wav = None
            vocal_wav = None
            bgm_wav = None
            melody_is_wav = True
        item['pmt_wav'] = pmt_wav
        item['vocal_wav'] = vocal_wav
        item['bgm_wav'] = bgm_wav
        item['melody_is_wav'] = melody_is_wav
        item["idx"] = f"{item['idx']}"
        item["wav_path"] = target_wav_name
        new_items.append(item)

    del audio_tokenizer
    del seperate_tokenizer
    del separator

    # Define model or load pretrained model
    model_light = CodecLM_PL(cfg, ckpt_path)
    model_light = model_light.eval()
    model_light.audiolm.cfg = cfg
    model = CodecLM(name = "tmp",
                    lm = model_light.audiolm,
                    audiotokenizer = None,
                    max_duration = max_duration,
                    seperate_tokenizer = None,
                    )
    del model_light
    model.lm = model.lm.cuda().to(torch.float16)

    cfg_coef = 1.5 #25
    temp = 0.9
    top_k = 50
    top_p = 0.0
    record_tokens = True
    record_window = 50

    model.set_generation_params(duration=max_duration, extend_stride=5, temperature=temp, cfg_coef=cfg_coef,
                                top_k=top_k, top_p=top_p, record_tokens=record_tokens, record_window=record_window)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_dir + "/audios", exist_ok=True)
    os.makedirs(save_dir + "/jsonl", exist_ok=True)


    for item in new_items:
        lyric = item["gt_lyric"]
        descriptions = item["descriptions"] if "descriptions" in item else None
        pmt_wav = item['pmt_wav']
        vocal_wav = item['vocal_wav']
        bgm_wav = item['bgm_wav']
        melody_is_wav = item['melody_is_wav']

        generate_inp = {
            'lyrics': [lyric.replace("  ", " ")],
            'descriptions': [descriptions],
            'melody_wavs': pmt_wav,
            'vocal_wavs': vocal_wav,
            'bgm_wavs': bgm_wav,
            'melody_is_wav': melody_is_wav,
        }
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            tokens = model.generate(**generate_inp, return_tokens=True)
        item['tokens'] = tokens

    del model
    torch.cuda.empty_cache()


    seperate_tokenizer = builders.get_audio_tokenizer_model(cfg.audio_tokenizer_checkpoint_sep, cfg)
    seperate_tokenizer = seperate_tokenizer.eval().cuda()

    model = CodecLM(name = "tmp",
                    lm = None,
                    audiotokenizer = None,
                    max_duration = max_duration,
                    seperate_tokenizer = seperate_tokenizer,
                    )
    for item in new_items:
        with torch.no_grad():
            if 'raw_pmt_wav' in item:
                wav_seperate = model.generate_audio(item['tokens'], item['raw_pmt_wav'], item['raw_vocal_wav'], item['raw_bgm_wav'], chunked=True)
                del item['raw_pmt_wav']
                del item['raw_vocal_wav']
                del item['raw_bgm_wav']
            else:
                wav_seperate = model.generate_audio(item['tokens'], chunked=True)
            torchaudio.save(item['wav_path'], wav_seperate[0].cpu().float(), cfg.sample_rate)
        del item['tokens']
        del item['pmt_wav']
        del item['vocal_wav']
        del item['bgm_wav']
        del item['melody_is_wav']

    torch.cuda.empty_cache()
    src_jsonl_name = os.path.split(input_jsonl)[-1]
    with open(f"{save_dir}/jsonl/{src_jsonl_name}.jsonl", "w", encoding='utf-8') as fw:
        for item in new_items:
            fw.writelines(json.dumps(item, ensure_ascii=False)+"\n")

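Note: generate_lowmem.py above reads one JSON object per line from the input JSONL. From the branching in the script, each entry needs an idx and a gt_lyric, plus at most one of prompt_audio_path or auto_prompt_audio_type, and optionally descriptions. A minimal sketch of the three prompt modes; the idx values, lyric text, and output file name below are illustrative:

import json

lyric = "[intro-short] ; [verse] 夜晚的街灯闪烁.回忆像潮水般涌来 ; [chorus] 回忆的温度还在.你却已不在 ; [outro-short]"
items = [
    # 1) style token prompt drawn from ckpt/prompt.pt ("Auto" picks a random genre)
    {"idx": "demo_autoprompt", "gt_lyric": lyric, "auto_prompt_audio_type": "Pop"},
    # 2) reference audio, split into vocal/bgm by the Separator defined above
    {"idx": "demo_audioprompt", "gt_lyric": lyric, "prompt_audio_path": "input/sample_prompt_audio.wav"},
    # 3) no audio prompt, optionally steered by a text description
    {"idx": "demo_textprompt", "gt_lyric": lyric, "descriptions": "female, dark, pop, sad, piano and drums, the bpm is 125."},
]
with open("my_songs.jsonl", "w", encoding="utf-8") as fw:
    for it in items:
        fw.write(json.dumps(it, ensure_ascii=False) + "\n")
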
generate_lowmem.sh
ADDED
@@ -0,0 +1,10 @@
export USER=root
export PYTHONDONTWRITEBYTECODE=1
export TRANSFORMERS_CACHE="$(pwd)/third_party/hub"
export NCCL_HOME=/usr/local/tccl
export PYTHONPATH="$(pwd)/codeclm/tokenizer/":"$(pwd)":"$(pwd)/codeclm/tokenizer/Flow1dVAE/":"$(pwd)/codeclm/tokenizer/":$PYTHONPATH

CKPT_PATH=$1
JSONL=$2
SAVE_DIR=$3
python3 generate_lowmem.py $CKPT_PATH $JSONL $SAVE_DIR

requirements.txt
CHANGED
@@ -0,0 +1,24 @@
alias-free-torch>=0.0.6
descript-audio-codec>=1.0.0
diffusers==0.27.2
einops>=0.8.1
einops-exts==0.0.4
flashy>=0.0.2
huggingface-hub==0.25.2
julius>=0.2.7
k-diffusion==0.1.1
kaldiio>=2.18.1
lameenc>=1.8.1
librosa>=0.11.0
lightning>=2.5.2
ninja>=1.11.1.4
nnAudio>=0.3.3
openunmix>=1.3.0
peft==0.10.0
torch==2.6.0
torchaudio==2.6.0
torchvision==0.21.0
transformers==4.37.2
vector-quantize-pytorch>=1.22.17
wheel>=0.45.1
x-transformers>=2.3.25

requirements_nodeps.txt
ADDED
@@ -0,0 +1,13 @@
fairseq==0.12.2
antlr4-python3-runtime==4.8
bitarray==3.4.3
cffi==1.17.1
colorama==0.4.6
cython==3.1.2
hydra-core==1.0.7
lxml==5.4.0
omegaconf==2.2.0
portalocker==3.2.0
pycparser==2.22
sacrebleu==2.5.1
tabulate==0.9.0

sample/lyrics.jsonl
CHANGED
@@ -1,4 +1,4 @@
 {"idx": "sample_01_autoprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]", "auto_prompt_audio_type": "Auto"}
 {"idx": "sample_01_noprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]"}
 {"idx": "sample_01_textprompt", "descriptions": "female, dark, pop, sad, piano and drums, the bpm is 125.", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]"}
-{"idx": "sample_01_audioprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]", "prompt_audio_path": "
+{"idx": "sample_01_audioprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]", "prompt_audio_path": "input/sample_prompt_audio.wav"}

tools/gradio/app.py
ADDED
@@ -0,0 +1,236 @@
import sys
import gradio as gr
import json
from datetime import datetime
import yaml
import time
import re
import os.path as op
from levo_inference_lowmem import LeVoInference

EXAMPLE_LYRICS = """
[intro-short]

[verse]
夜晚的街灯闪烁
我漫步在熟悉的角落
回忆像潮水般涌来
你的笑容如此清晰
在心头无法抹去
那些曾经的甜蜜
如今只剩我独自回忆

[verse]
手机屏幕亮起
是你发来的消息
简单的几个字
却让我泪流满面
曾经的拥抱温暖
如今却变得遥远
我多想回到从前
重新拥有你的陪伴

[chorus]
回忆的温度还在
你却已不在
我的心被爱填满
却又被思念刺痛
音乐的节奏奏响
我的心却在流浪
没有你的日子
我该如何继续向前

[outro-short]
""".strip()

APP_DIR = op.dirname(op.dirname(op.dirname(op.abspath(__file__))))
MODEL = LeVoInference(sys.argv[1])
with open(op.join(APP_DIR, 'conf/vocab.yaml'), 'r', encoding='utf-8') as file:
    STRUCTS = yaml.safe_load(file)


def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=None, top_k=None, progress=gr.Progress(track_tqdm=True)):
    global MODEL
    global STRUCTS
    params = {'cfg_coef':cfg_coef, 'temperature':temperature, 'top_k':top_k}
    params = {k:v for k,v in params.items() if v is not None}
    vocal_structs = ['[verse]', '[chorus]', '[bridge]']
    sample_rate = MODEL.cfg.sample_rate

    # format lyric
    lyric = lyric.replace("[intro]", "[intro-short]").replace("[inst]", "[inst-short]").replace("[outro]", "[outro-short]")
    paragraphs = [p.strip() for p in lyric.strip().split('\n\n') if p.strip()]
    if len(paragraphs) < 1:
        return None, json.dumps("Lyrics can not be left blank")
    paragraphs_norm = []
    vocal_flag = False
    for para in paragraphs:
        lines = para.splitlines()
        struct_tag = lines[0].strip().lower()
        if struct_tag not in STRUCTS:
            return None, json.dumps(f"Segments should start with a structure tag in {STRUCTS}")
        if struct_tag in vocal_structs:
            vocal_flag = True
            if len(lines) < 2 or not [line.strip() for line in lines[1:] if line.strip()]:
                return None, json.dumps("The following segments require lyrics: [verse], [chorus], [bridge]")
            else:
                new_para_list = []
                for line in lines[1:]:
                    new_para_list.append(re.sub(r"[^\w\s\[\]\-\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u00c0-\u017f]", "", line))
                new_para_str = f"{struct_tag} {'.'.join(new_para_list)}"
        else:
            if len(lines) > 1:
                return None, json.dumps("The following segments should not contain lyrics: [intro], [intro-short], [intro-medium], [inst], [inst-short], [inst-medium], [outro], [outro-short], [outro-medium]")
            else:
                new_para_str = struct_tag
        paragraphs_norm.append(new_para_str)
    if not vocal_flag:
        return None, json.dumps(f"The lyric must contain at least one of the following structures: {vocal_structs}")
    lyric_norm = " ; ".join(paragraphs_norm)

    # format prompt
    if prompt_audio is not None:
        genre = None
        description = None
    elif description is not None and description != "":
        genre = None

    progress(0.0, "Start Generation")
    start = time.time()

    audio_data = MODEL(lyric_norm, description, prompt_audio, genre, op.join(APP_DIR, "ckpt/prompt.pt"), params).cpu().permute(1, 0).float().numpy()

    end = time.time()

    # Build a JSON record of the generation inputs
    input_config = {
        "lyric": lyric_norm,
        "genre": genre,
        "prompt_audio": prompt_audio,
        "description": description,
        "params": params,
        "inference_duration": end - start,
        "timestamp": datetime.now().isoformat(),
    }

    return (sample_rate, audio_data), json.dumps(input_config, indent=2)


# Build the Gradio interface
with gr.Blocks(title="SongGeneration Demo Space") as demo:
    gr.Markdown("# 🎵 SongGeneration Demo Space")
    gr.Markdown("Demo interface for the song generation model. Provide lyrics, and optionally an audio or text prompt, to generate a custom song.")

    with gr.Row():
        with gr.Column():
            lyric = gr.Textbox(
                label="Lyrics",
                lines=5,
                max_lines=15,
                value=EXAMPLE_LYRICS,
                info="Each paragraph represents a segment starting with a structure tag and ending with a blank line; each line is a sentence without punctuation. Segments [intro], [inst], [outro] should not contain lyrics, while [verse], [chorus], and [bridge] require lyrics.",
                placeholder="""Lyric Format
'''
[structure tag]
lyrics

[structure tag]
lyrics
'''
1. One paragraph represents one segment, starting with a structure tag and ending with a blank line
2. One line represents one sentence, punctuation is not recommended inside the sentence
3. The following segments should not contain lyrics: [intro-short], [intro-medium], [inst-short], [inst-medium], [outro-short], [outro-medium]
4. The following segments require lyrics: [verse], [chorus], [bridge]
"""
            )

            with gr.Tabs(elem_id="extra-tabs"):
                with gr.Tab("Genre Select"):
                    genre = gr.Radio(
                        choices=["Pop", "R&B", "Dance", "Jazz", "Folk", "Rock", "Chinese Style", "Chinese Tradition", "Metal", "Reggae", "Chinese Opera", "Auto"],
                        label="Genre Select(Optional)",
                        value="Pop",
                        interactive=True,
                        elem_id="single-select-radio"
                    )
                with gr.Tab("Audio Prompt"):
                    prompt_audio = gr.Audio(
                        label="Prompt Audio (Optional)",
                        type="filepath",
                        elem_id="audio-prompt"
                    )
                with gr.Tab("Text Prompt"):
                    gr.Markdown("For detailed usage, please refer to [here](https://github.com/tencent-ailab/SongGeneration?tab=readme-ov-file#-description-input-format)")
                    description = gr.Textbox(
                        label="Song Description (Optional)",
                        info="Describe the gender, timbre, genre, emotion, instrument and bpm of the song. Only English is supported currently.",
                        placeholder="female, dark, pop, sad, piano and drums, the bpm is 125.",
                        lines=1,
                        max_lines=2
                    )

            with gr.Accordion("Advanced Config", open=False):
                cfg_coef = gr.Slider(
                    label="CFG Coefficient",
                    minimum=0.1,
                    maximum=3.0,
                    step=0.1,
                    value=1.5,
                    interactive=True,
                    elem_id="cfg-coef",
                )
                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=2.0,
                    step=0.1,
                    value=0.9,
                    interactive=True,
                    elem_id="temperature",
                )
                top_k = gr.Slider(
                    label="Top-K",
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=50,
                    interactive=True,
                    elem_id="top_k",
                )
            generate_btn = gr.Button("Generate Song", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Generated Song", type="numpy")
            output_json = gr.JSON(label="Generated Info")

    # # Example buttons
    # examples = gr.Examples(
    #     examples=[
    #         ["male, bright, rock, happy, electric guitar and drums, the bpm is 150."],
    #         ["female, warm, jazz, romantic, synthesizer and piano, the bpm is 100."]
    #     ],
    #     inputs=[description],
    #     label="Text Prompt examples"
    # )

    # examples = gr.Examples(
    #     examples=[
    #         "[intro-medium]\n\n[verse]\n在这个疯狂的世界里\n谁不渴望一点改变\n在爱情面前\n我们都显得那么不安全\n你紧紧抱着我\n告诉我再靠近一点\n别让这璀璨的夜晚白白浪费\n我那迷茫的眼睛\n看不见未来的路\n在情感消散之前\n我们对爱的渴望永不熄灭\n你给我留下一句誓言\n想知道我们的爱是否能持续到永远\n[chorus]\n\n约定在那最后的夜晚\n不管命运如何摆布\n我们的心是否依然如初\n我会穿上红衬衫\n带着摇滚的激情\n回到我们初遇的地方\n约定在那最后的夜晚\n就算全世界都变了样\n我依然坚守诺言\n铭记这一天\n你永远是我心中的爱恋\n\n[outro-medium]\n",
    #         "[intro-short]\n\n[verse]\nThrough emerald canyons where fireflies dwell\nCerulean berries kiss morning's first swell\nCrystalline dew crowns each Vitamin Dawn's confection dissolves slowly on me\nAmbrosia breezes through honeycomb vines\nNature's own candy in Fibonacci lines\n[chorus] Blueberry fruit so sweet\n takes you higher\n can't be beat\n In your lungs\n it starts to swell\n You're under its spell\n [verse] Resin of sunlight in candied retreat\nMarmalade moonbeams melt under bare feet\nNectar spirals bloom chloroplast champagne\nPhotosynthesis sings through my veins\nChlorophyll rhythms pulse warm in my blood\nThe forest's green pharmacy floods every bud[chorus] Blueberry fruit so sweet\n takes you higher\n can't be beat\n In your lungs\n it starts to swell\n You're under its spell\n feel the buzz\n ride the wave\n Limey me\n blueberry\n your mind's enslaved\n In the haze\n lose all time\n floating free\n feeling fine\n Blueberry\n fruit so sweet\n takes you higher\n can't be beat\n In your lungs\n it starts to swell\n cry\n You're under its spell\n\n[outro-short]\n",
    #     ],
    #     inputs=[lyric],
    #     label="Lyrics examples",
    # )

    # Wire up the generate button
    generate_btn.click(
        fn=generate_song,
        inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, top_k],
        outputs=[output_audio, output_json]
    )


# Launch the app
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=8081)

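Note: generate_song above flattens the multi-paragraph lyric from the textbox into the single-line format the model consumes, i.e. "[tag] sentence.sentence" chunks joined by " ; ". A minimal sketch of just that flattening step, with the tag validation and character filtering omitted:

def normalize_lyric(lyric: str) -> str:
    """Minimal sketch of the flattening done in generate_song (validation omitted)."""
    out = []
    # Paragraphs are separated by blank lines; the first line of each paragraph is a structure tag.
    for para in lyric.strip().split("\n\n"):
        lines = [l.strip() for l in para.splitlines() if l.strip()]
        if not lines:
            continue
        tag, body = lines[0].lower(), lines[1:]
        out.append(f"{tag} {'.'.join(body)}" if body else tag)
    return " ; ".join(out)

# "[verse]\n夜晚的街灯闪烁\n我漫步在熟悉的角落" -> "[verse] 夜晚的街灯闪烁.我漫步在熟悉的角落"
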
tools/gradio/levo_inference.py
ADDED
@@ -0,0 +1,110 @@
import os
import sys

import torch

import json
import numpy as np
from omegaconf import OmegaConf

from codeclm.trainer.codec_song_pl import CodecLM_PL
from codeclm.models import CodecLM

from separator import Separator


class LeVoInference(torch.nn.Module):
    def __init__(self, ckpt_path):
        super().__init__()

        torch.backends.cudnn.enabled = False
        OmegaConf.register_new_resolver("eval", lambda x: eval(x))
        OmegaConf.register_new_resolver("concat", lambda *x: [xxx for xx in x for xxx in xx])
        OmegaConf.register_new_resolver("get_fname", lambda: 'default')
        OmegaConf.register_new_resolver("load_yaml", lambda x: list(OmegaConf.load(x)))

        cfg_path = os.path.join(ckpt_path, 'config.yaml')
        pt_path = os.path.join(ckpt_path, 'model.pt')

        self.cfg = OmegaConf.load(cfg_path)
        self.cfg.mode = 'inference'
        self.max_duration = self.cfg.max_dur

        # Define model or load pretrained model
        model_light = CodecLM_PL(self.cfg, pt_path)

        model_light = model_light.eval().cuda()
        model_light.audiolm.cfg = self.cfg

        self.model_lm = model_light.audiolm
        self.model_audio_tokenizer = model_light.audio_tokenizer
        self.model_seperate_tokenizer = model_light.seperate_tokenizer

        self.model = CodecLM(name = "tmp",
                             lm = self.model_lm,
                             audiotokenizer = self.model_audio_tokenizer,
                             max_duration = self.max_duration,
                             seperate_tokenizer = self.model_seperate_tokenizer,
                             )
        self.separator = Separator()


        self.default_params = dict(
            cfg_coef = 1.5,
            temperature = 1.0,
            top_k = 50,
            top_p = 0.0,
            record_tokens = True,
            record_window = 50,
            extend_stride = 5,
            duration = self.max_duration,
        )

        self.model.set_generation_params(**self.default_params)

    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, params = dict()):
        params = {**self.default_params, **params}
        self.model.set_generation_params(**params)

        if prompt_audio_path is not None and os.path.exists(prompt_audio_path):
            pmt_wav, vocal_wav, bgm_wav = self.separator.run(prompt_audio_path)
            melody_is_wav = True
        elif genre is not None and auto_prompt_path is not None:
            auto_prompt = torch.load(auto_prompt_path)
            merge_prompt = [item for sublist in auto_prompt.values() for item in sublist]
            if genre == "Auto":
                prompt_token = merge_prompt[np.random.randint(0, len(merge_prompt))]
            else:
                prompt_token = auto_prompt[genre][np.random.randint(0, len(auto_prompt[genre]))]
            pmt_wav = prompt_token[:,[0],:]
            vocal_wav = prompt_token[:,[1],:]
            bgm_wav = prompt_token[:,[2],:]
            melody_is_wav = False
        else:
            pmt_wav = None
            vocal_wav = None
            bgm_wav = None
            melody_is_wav = True

        generate_inp = {
            'lyrics': [lyric.replace("  ", " ")],
            'descriptions': [description],
            'melody_wavs': pmt_wav,
            'vocal_wavs': vocal_wav,
            'bgm_wavs': bgm_wav,
            'melody_is_wav': melody_is_wav,
        }

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            tokens = self.model.generate(**generate_inp, return_tokens=True)

        if tokens.shape[-1] > 3000:
            tokens = tokens[..., :3000]

        with torch.no_grad():
            if melody_is_wav:
                wav_seperate = self.model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav)
            else:
                wav_seperate = self.model.generate_audio(tokens)

        return wav_seperate[0]

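Note: LeVoInference above returns a waveform tensor shaped [channels, samples] at self.cfg.sample_rate, which app.py converts for Gradio with .cpu().permute(1, 0).float().numpy(). A minimal direct-usage sketch; the checkpoint directory name and output path are placeholders:

import torchaudio
from levo_inference import LeVoInference

# "ckpt/songgeneration_base" is a placeholder for a checkpoint directory
# containing config.yaml and model.pt.
model = LeVoInference("ckpt/songgeneration_base")
lyric = "[intro-short] ; [verse] 夜晚的街灯闪烁.我漫步在熟悉的角落 ; [chorus] 回忆的温度还在.你却已不在 ; [outro-short]"
wav = model(lyric, genre="Pop", auto_prompt_path="ckpt/prompt.pt")  # [channels, samples]
torchaudio.save("song.flac", wav.cpu().float(), model.cfg.sample_rate)
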
tools/gradio/levo_inference_lowmem.py
ADDED
@@ -0,0 +1,129 @@
import os
import sys

import torch

import json
import numpy as np
from omegaconf import OmegaConf

from codeclm.trainer.codec_song_pl import CodecLM_PL
from codeclm.models import CodecLM
from codeclm.models import builders

from separator import Separator


class LeVoInference(torch.nn.Module):
    def __init__(self, ckpt_path):
        super().__init__()

        torch.backends.cudnn.enabled = False
        OmegaConf.register_new_resolver("eval", lambda x: eval(x))
        OmegaConf.register_new_resolver("concat", lambda *x: [xxx for xx in x for xxx in xx])
        OmegaConf.register_new_resolver("get_fname", lambda: 'default')
        OmegaConf.register_new_resolver("load_yaml", lambda x: list(OmegaConf.load(x)))

        cfg_path = os.path.join(ckpt_path, 'config.yaml')
        self.pt_path = os.path.join(ckpt_path, 'model.pt')

        self.cfg = OmegaConf.load(cfg_path)
        self.cfg.mode = 'inference'
        self.max_duration = self.cfg.max_dur

        self.default_params = dict(
            top_p = 0.0,
            record_tokens = True,
            record_window = 50,
            extend_stride = 5,
            duration = self.max_duration,
        )


    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, params = dict()):
        if prompt_audio_path is not None and os.path.exists(prompt_audio_path):
            separator = Separator()
            audio_tokenizer = builders.get_audio_tokenizer_model(self.cfg.audio_tokenizer_checkpoint, self.cfg)
            audio_tokenizer = audio_tokenizer.eval().cuda()
            seperate_tokenizer = builders.get_audio_tokenizer_model(self.cfg.audio_tokenizer_checkpoint_sep, self.cfg)
            seperate_tokenizer = seperate_tokenizer.eval().cuda()
            pmt_wav, vocal_wav, bgm_wav = separator.run(prompt_audio_path)
            pmt_wav = pmt_wav.cuda()
            vocal_wav = vocal_wav.cuda()
            bgm_wav = bgm_wav.cuda()
            pmt_wav, _ = audio_tokenizer.encode(pmt_wav)
            vocal_wav, bgm_wav = seperate_tokenizer.encode(vocal_wav, bgm_wav)
            melody_is_wav = False
            del audio_tokenizer
            del seperate_tokenizer
            del separator
        elif genre is not None and auto_prompt_path is not None:
            auto_prompt = torch.load(auto_prompt_path)
            merge_prompt = [item for sublist in auto_prompt.values() for item in sublist]
            if genre == "Auto":
                prompt_token = merge_prompt[np.random.randint(0, len(merge_prompt))]
            else:
                prompt_token = auto_prompt[genre][np.random.randint(0, len(auto_prompt[genre]))]
            pmt_wav = prompt_token[:,[0],:]
            vocal_wav = prompt_token[:,[1],:]
            bgm_wav = prompt_token[:,[2],:]
            melody_is_wav = False
        else:
            pmt_wav = None
            vocal_wav = None
            bgm_wav = None
            melody_is_wav = True

        model_light = CodecLM_PL(self.cfg, self.pt_path)
        model_light = model_light.eval()
        model_light.audiolm.cfg = self.cfg
        model = CodecLM(name = "tmp",
                        lm = model_light.audiolm,
                        audiotokenizer = None,
                        max_duration = self.max_duration,
                        seperate_tokenizer = None,
                        )
        del model_light
        model.lm = model.lm.cuda().to(torch.float16)
        params = {**self.default_params, **params}
        model.set_generation_params(**params)

        generate_inp = {
            'lyrics': [lyric.replace("  ", " ")],
            'descriptions': [description],
            'melody_wavs': pmt_wav,
            'vocal_wavs': vocal_wav,
            'bgm_wavs': bgm_wav,
            'melody_is_wav': melody_is_wav,
        }

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            tokens = model.generate(**generate_inp, return_tokens=True)

        del model
        torch.cuda.empty_cache()

        seperate_tokenizer = builders.get_audio_tokenizer_model(self.cfg.audio_tokenizer_checkpoint_sep, self.cfg)
        seperate_tokenizer = seperate_tokenizer.eval().cuda()
        model = CodecLM(name = "tmp",
                        lm = None,
                        audiotokenizer = None,
                        max_duration = self.max_duration,
                        seperate_tokenizer = seperate_tokenizer,
                        )

        if tokens.shape[-1] > 3000:
            tokens = tokens[..., :3000]

        with torch.no_grad():
            if melody_is_wav:
                wav_seperate = model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav)
            else:
                wav_seperate = model.generate_audio(tokens)

        del seperate_tokenizer
        del model
        torch.cuda.empty_cache()

        return wav_seperate[0]

tools/gradio/run.sh
ADDED
@@ -0,0 +1,9 @@
export USER=root
export PYTHONDONTWRITEBYTECODE=1
export TRANSFORMERS_CACHE="$(pwd)/third_party/hub"
export NCCL_HOME=/usr/local/tccl
export PYTHONPATH="$(pwd)/codeclm/tokenizer/":"$(pwd)":"$(pwd)/codeclm/tokenizer/Flow1dVAE/":"$(pwd)/codeclm/tokenizer/":$PYTHONPATH


CKPT_PATH=$1
python3 tools/gradio/app.py $CKPT_PATH

tools/gradio/separator.py
ADDED
@@ -0,0 +1,50 @@
import torchaudio
import os
import torch
from third_party.demucs.models.pretrained import get_model_from_yaml


class Separator(torch.nn.Module):
    def __init__(self, dm_model_path='third_party/demucs/ckpt/htdemucs.pth', dm_config_path='third_party/demucs/ckpt/htdemucs.yaml', gpu_id=0) -> None:
        super().__init__()
        if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():
            self.device = torch.device(f"cuda:{gpu_id}")
        else:
            self.device = torch.device("cpu")
        self.demucs_model = self.init_demucs_model(dm_model_path, dm_config_path)

    def init_demucs_model(self, model_path, config_path):
        model = get_model_from_yaml(config_path, model_path)
        model.to(self.device)
        model.eval()
        return model

    def load_audio(self, f):
        a, fs = torchaudio.load(f)
        if (fs != 48000):
            a = torchaudio.functional.resample(a, fs, 48000)
        if a.shape[-1] >= 48000*10:
            a = a[..., :48000*10]
        else:
            a = torch.cat([a, a], -1)
        return a[:, 0:48000*10]

    def run(self, audio_path, output_dir='tmp', ext=".flac"):
        os.makedirs(output_dir, exist_ok=True)
        name, _ = os.path.splitext(os.path.split(audio_path)[-1])
        output_paths = []

        for stem in self.demucs_model.sources:
            output_path = os.path.join(output_dir, f"{name}_{stem}{ext}")
            if os.path.exists(output_path):
                output_paths.append(output_path)
        if len(output_paths) == 1:  # 4
            vocal_path = output_paths[0]
        else:
            drums_path, bass_path, other_path, vocal_path = self.demucs_model.separate(audio_path, output_dir, device=self.device)
            for path in [drums_path, bass_path, other_path]:
                os.remove(path)
        full_audio = self.load_audio(audio_path)
        vocal_audio = self.load_audio(vocal_path)
        bgm_audio = full_audio - vocal_audio
        return full_audio, vocal_audio, bgm_audio

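Note: Separator.run above returns three 48 kHz tensors trimmed (or looped) to at most 10 seconds: the full mix, the Demucs vocal stem, and their difference used as the backing track. A small usage sketch with the repo's sample prompt audio as input; the output preview path is a placeholder:

import torchaudio
from separator import Separator

sep = Separator()  # loads the htdemucs checkpoint from third_party/demucs/ckpt by default
full, vocal, bgm = sep.run("input/sample_prompt_audio.wav", output_dir="tmp")
print(full.shape, vocal.shape, bgm.shape)  # each [channels, <=480000], i.e. at most 10 s at 48 kHz
torchaudio.save("tmp/bgm_preview.flac", bgm, 48000)
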