mooki0 committed on
Commit 57276d4 · verified · 1 Parent(s): d756e9e

Initial commit of Gradio app

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +14 -0
  2. .gitignore +3 -0
  3. LICENSE +81 -0
  4. Miniconda3-latest-MacOSX-arm64.sh +3 -0
  5. README.md +232 -12
  6. README_zh_cn.md +224 -0
  7. app.py +245 -0
  8. assets/application.png +3 -0
  9. assets/arch.jpg +3 -0
  10. assets/panorama1.gif +3 -0
  11. assets/panorama2.gif +3 -0
  12. assets/qrcode/discord.png +0 -0
  13. assets/qrcode/wechat.png +0 -0
  14. assets/qrcode/x.png +0 -0
  15. assets/qrcode/xiaohongshu.png +0 -0
  16. assets/quick_look.gif +3 -0
  17. assets/roaming_world.gif +3 -0
  18. assets/teaser.png +3 -0
  19. demo_panogen.py +223 -0
  20. demo_scenegen.py +120 -0
  21. docker/HunyuanWorld.osx-cpu.yaml +142 -0
  22. docker/HunyuanWorld.osx64.yaml +247 -0
  23. docker/HunyuanWorld.yaml +246 -0
  24. docker/HunyuanWorld_mac.yaml +186 -0
  25. examples/case1/classes.txt +1 -0
  26. examples/case1/input.png +3 -0
  27. examples/case2/classes.txt +1 -0
  28. examples/case2/input.png +3 -0
  29. examples/case2/labels_fg1.txt +1 -0
  30. examples/case2/labels_fg2.txt +1 -0
  31. examples/case3/classes.txt +1 -0
  32. examples/case3/input.png +3 -0
  33. examples/case4/classes.txt +1 -0
  34. examples/case4/prompt.txt +1 -0
  35. examples/case5/classes.txt +1 -0
  36. examples/case5/input.png +3 -0
  37. examples/case6/classes.txt +1 -0
  38. examples/case6/input.png +3 -0
  39. examples/case6/labels_fg1.txt +1 -0
  40. examples/case7/classes.txt +1 -0
  41. examples/case7/prompt.txt +1 -0
  42. examples/case8/classes.txt +1 -0
  43. examples/case8/input.png +3 -0
  44. examples/case9/classes.txt +1 -0
  45. examples/case9/prompt.txt +1 -0
  46. hy3dworld/__init__.py +22 -0
  47. hy3dworld/models/__init__.py +29 -0
  48. hy3dworld/models/adaptive_depth_compression.py +474 -0
  49. hy3dworld/models/layer_decomposer.py +155 -0
  50. hy3dworld/models/pano_generator.py +236 -0
.gitattributes CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Miniconda3-latest-MacOSX-arm64.sh filter=lfs diff=lfs merge=lfs -text
37
+ assets/application.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/arch.jpg filter=lfs diff=lfs merge=lfs -text
39
+ assets/panorama1.gif filter=lfs diff=lfs merge=lfs -text
40
+ assets/panorama2.gif filter=lfs diff=lfs merge=lfs -text
41
+ assets/quick_look.gif filter=lfs diff=lfs merge=lfs -text
42
+ assets/roaming_world.gif filter=lfs diff=lfs merge=lfs -text
43
+ assets/teaser.png filter=lfs diff=lfs merge=lfs -text
44
+ examples/case1/input.png filter=lfs diff=lfs merge=lfs -text
45
+ examples/case2/input.png filter=lfs diff=lfs merge=lfs -text
46
+ examples/case3/input.png filter=lfs diff=lfs merge=lfs -text
47
+ examples/case5/input.png filter=lfs diff=lfs merge=lfs -text
48
+ examples/case6/input.png filter=lfs diff=lfs merge=lfs -text
49
+ examples/case8/input.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ # Ignore Python bytecode files
2
+ __pycache__/
3
+ *.pyc
LICENSE ADDED
@@ -0,0 +1,81 @@
1
+ TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
2
+ Tencent HunyuanWorld-1.0 Release Date: July 27, 2025
3
+ THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
5
+ 1. DEFINITIONS.
6
+ a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
7
+ b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent HunyuanWorld-1.0 Works or any portion or element thereof set forth herein.
8
+ c. “Documentation” shall mean the specifications, manuals and documentation for Tencent HunyuanWorld-1.0 made publicly available by Tencent.
9
+ d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
10
+ e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent HunyuanWorld-1.0 Works for any purpose and in any field of use.
11
+ f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent HunyuanWorld-1.0 and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
12
+ g. “Model Derivatives” shall mean all: (i) modifications to Tencent HunyuanWorld-1.0 or any Model Derivative of Tencent HunyuanWorld-1.0; (ii) works based on Tencent HunyuanWorld-1.0 or any Model Derivative of Tencent HunyuanWorld-1.0; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent HunyuanWorld-1.0 or any Model Derivative of Tencent HunyuanWorld-1.0, to that model in order to cause that model to perform similarly to Tencent HunyuanWorld-1.0 or a Model Derivative of Tencent HunyuanWorld-1.0, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent HunyuanWorld-1.0 or a Model Derivative of Tencent HunyuanWorld-1.0 for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
13
+ h. “Output” shall mean the information and/or content output of Tencent HunyuanWorld-1.0 or a Model Derivative that results from operating or otherwise using Tencent HunyuanWorld-1.0 or a Model Derivative, including via a Hosted Service.
14
+ i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
15
+ j. “Tencent HunyuanWorld-1.0” shall mean the 3D generation models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
16
+ k. “Tencent HunyuanWorld-1.0 Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
17
+ l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
18
+ m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
19
+ n. “including” shall mean including but not limited to.
20
+ 2. GRANT OF RIGHTS.
21
+ We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
22
+ 3. DISTRIBUTION.
23
+ You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent HunyuanWorld-1.0 Works, exclusively in the Territory, provided that You meet all of the following conditions:
24
+ a. You must provide all such Third Party recipients of the Tencent HunyuanWorld-1.0 Works or products or services using them a copy of this Agreement;
25
+ b. You must cause any modified files to carry prominent notices stating that You changed the files;
26
+ c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent HunyuanWorld-1.0 Works; and (ii) mark the products or services developed by using the Tencent HunyuanWorld-1.0 Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
27
+ d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent HunyuanWorld-1.0 is licensed under the Tencent HunyuanWorld-1.0 Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
28
+ You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent HunyuanWorld-1.0 Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
29
+ 4. ADDITIONAL COMMERCIAL TERMS.
30
+ If, on the Tencent HunyuanWorld-1.0 version release date, the monthly active users of all products or services made available by or for Licensee is greater than 1 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
31
+ Subject to Tencent's written approval, you may request a license for the use of Tencent HunyuanWorld-1.0 by submitting the following information to [email protected]:
32
+ a. Your company’s name and associated business sector that plans to use Tencent HunyuanWorld-1.0.
33
+ b. Your intended use case and the purpose of using Tencent HunyuanWorld-1.0.
34
+ c. Your plans to modify Tencent HunyuanWorld-1.0 or create Model Derivatives.
35
+ 5. RULES OF USE.
36
+ a. Your use of the Tencent HunyuanWorld-1.0 Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent HunyuanWorld-1.0 Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent HunyuanWorld-1.0 Works and You must provide notice to subsequent users to whom You distribute that Tencent HunyuanWorld-1.0 Works are subject to the use restrictions in these Sections 5(a) and 5(b).
37
+ b. You must not use the Tencent HunyuanWorld-1.0 Works or any Output or results of the Tencent HunyuanWorld-1.0 Works to improve any other AI model (other than Tencent HunyuanWorld-1.0 or Model Derivatives thereof).
38
+ c. You must not use, reproduce, modify, distribute, or display the Tencent HunyuanWorld-1.0 Works, Output or results of the Tencent HunyuanWorld-1.0 Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
39
+ 6. INTELLECTUAL PROPERTY.
40
+ a. Subject to Tencent’s ownership of Tencent HunyuanWorld-1.0 Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
41
+ b. No trademark licenses are granted under this Agreement, and in connection with the Tencent HunyuanWorld-1.0 Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent HunyuanWorld-1.0 Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
42
+ c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent HunyuanWorld-1.0 Works.
43
+ d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
44
+ 7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
45
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent HunyuanWorld-1.0 Works or to grant any license thereto.
46
+ b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUANWORLD-1.0 WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUANWORLD-1.0 WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUANWORLD-1.0 WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
47
+ c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUANWORLD-1.0 WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
48
+ 8. SURVIVAL AND TERMINATION.
49
+ a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
50
+ b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent HunyuanWorld-1.0 Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
51
+ 9. GOVERNING LAW AND JURISDICTION.
52
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
53
+ b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
54
+
55
+ EXHIBIT A
56
+ ACCEPTABLE USE POLICY
57
+
58
+ Tencent reserves the right to update this Acceptable Use Policy from time to time.
59
+ Last modified: November 5, 2024
60
+
61
+ Tencent endeavors to promote safe and fair use of its tools and features, including Tencent HunyuanWorld-1.0. You agree not to use Tencent HunyuanWorld-1.0 or Model Derivatives:
62
+ 1. Outside the Territory;
63
+ 2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
64
+ 3. To harm Yourself or others;
65
+ 4. To repurpose or distribute output from Tencent HunyuanWorld-1.0 or any Model Derivatives to harm Yourself or others;
66
+ 5. To override or circumvent the safety guardrails and safeguards We have put in place;
67
+ 6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
68
+ 7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
69
+ 8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
70
+ 9. To intentionally defame, disparage or otherwise harass others;
71
+ 10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
72
+ 11. To generate or disseminate personal identifiable information with the purpose of harming others;
73
+ 12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
74
+ 13. To impersonate another individual without consent, authorization, or legal right;
75
+ 14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
76
+ 15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
77
+ 16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
78
+ 17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
79
+ 18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
80
+ 19. For military purposes;
81
+ 20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
Miniconda3-latest-MacOSX-arm64.sh ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ec6f7981770b3396a9ab426e07ac8ef5b12b4393aa2e4bcc984376fe3aa327e
3
+ size 114835350
README.md CHANGED
@@ -1,12 +1,232 @@
1
- ---
2
- title: HunyuanWorld Demo
3
- emoji: 📊
4
- colorFrom: red
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.38.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ [中文阅读](README_zh_cn.md)
2
+
3
+ <p align="center">
4
+ <img src="assets/teaser.png">
5
+ </p>
6
+
7
+ <div align="center">
8
+ <a href=https://3d.hunyuan.tencent.com/sceneTo3D target="_blank"><img src=https://img.shields.io/badge/Official%20Site-333399.svg?logo=homepage height=22px></a>
9
+ <a href=https://huggingface.co/tencent/HunyuanWorld-1 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
10
+ <a href=https://3d-models.hunyuan.tencent.com/world/ target="_blank"><img src= https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
11
+ <a href=https://discord.gg/dNBrdrGGMa target="_blank"><img src= https://img.shields.io/badge/Discord-white.svg?logo=discord height=22px></a>
12
+ <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
13
+ <a href="#community-resources" target="_blank"><img src=https://img.shields.io/badge/Community-lavender.svg?logo=homeassistantcommunitystore height=22px></a>
14
+ </div>
15
+
16
+ [//]: # ( <a href=# target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>)
17
+
18
+ [//]: # ( <a href=# target="_blank"><img src= https://img.shields.io/badge/Colab-8f2628.svg?logo=googlecolab height=22px></a>)
19
+
20
+ [//]: # ( <a href="#"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/v/mulankit?logo=pypi" height=22px></a>)
21
+
22
+ <br>
23
+
24
+ <p align="center">
25
+ "To see a World in a Grain of Sand, and a Heaven in a Wild Flower"
26
+ </p>
27
+
28
+ https://github.com/user-attachments/assets/513c9529-2b34-4872-b38f-4f291f3ae1c7
29
+
30
+ ## 🔥 News
31
+ - July 26, 2025: 👋 We present the technical report of HunyuanWorld-1.0; please check out the details and join the discussion!
32
+ - July 26, 2025: 🤗 We release the first open-source, simulation-capable, immersive 3D world generation model, HunyuanWorld-1.0!
33
+
34
+ > Join our **[Wechat](#)** and **[Discord](https://discord.gg/dNBrdrGGMa)** groups to discuss and get help from us.
35
+
36
+ | Wechat Group | Xiaohongshu | X | Discord |
37
+ |--------------------------------------------------|-------------------------------------------------------|---------------------------------------------|---------------------------------------------------|
38
+ | <img src="assets/qrcode/wechat.png" height=140> | <img src="assets/qrcode/xiaohongshu.png" height=140> | <img src="assets/qrcode/x.png" height=140> | <img src="assets/qrcode/discord.png" height=140> |
39
+
40
+ ## ☯️ **HunyuanWorld 1.0**
41
+
42
+ ### Abstract
43
+ Creating immersive and playable 3D worlds from texts or images remains a fundamental challenge in computer vision and graphics. Existing world generation approaches typically fall into two categories: video-based methods that offer rich diversity but lack 3D consistency and rendering efficiency, and 3D-based methods that provide geometric consistency but struggle with limited training data and memory-inefficient representations. To address these limitations, we present HunyuanWorld 1.0, a novel framework that combines the best of both sides for generating immersive, explorable, and interactive 3D worlds from text and image conditions. Our approach features three key advantages: 1) 360° immersive experiences via panoramic world proxies; 2) mesh export capabilities for seamless compatibility with existing computer graphics pipelines; 3) disentangled object representations for augmented interactivity. The core of our framework is a semantically layered 3D mesh representation that leverages panoramic images as 360° world proxies for semantic-aware world decomposition and reconstruction, enabling the generation of diverse 3D worlds. Extensive experiments demonstrate that our method achieves state-of-the-art performance in generating coherent, explorable, and interactive 3D worlds while enabling versatile applications in virtual reality, physical simulation, game development, and interactive content creation.
44
+
45
+ <p align="center">
46
+ <img src="assets/application.png">
47
+ </p>
48
+
49
+ ### Architecture
50
+ Tencent HunyuanWorld-1.0's generation architecture integrates panoramic proxy generation, semantic layering, and hierarchical 3D reconstruction to achieve high-quality scene-scale 360° 3D world generation, supporting both text and image inputs.
51
+
52
+ <p align="left">
53
+ <img src="assets/arch.jpg">
54
+ </p>
55
+
56
+ ### Performance
57
+
58
+ We evaluated HunyuanWorld 1.0 against other open-source panorama generation and 3D world generation methods. The numerical results indicate that HunyuanWorld 1.0 surpasses the baselines in visual quality and geometric consistency.
59
+
60
+ <p align="center">
61
+ Text-to-panorama generation
62
+ </p>
63
+
64
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
65
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
66
+ | Diffusion360 | 69.5 | 7.5 | 1.8 | 20.9 |
67
+ | MVDiffusion | 47.9 | 7.1 | 2.4 | 21.5 |
68
+ | PanFusion | 56.6 | 7.6 | 2.2 | 21.0 |
69
+ | LayerPano3D | 49.6 | 6.5 | 3.7 | 21.5 |
70
+ | HunyuanWorld 1.0 | 40.8 | 5.8 | 4.4 | 24.3 |
71
+
72
+ <p align="center">
73
+ Image-to-panorama generation
74
+ </p>
75
+
76
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
77
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
78
+ | Diffusion360 | 71.4 | 7.8 | 1.9 | 73.9 |
79
+ | MVDiffusion | 47.7 | 7.0 | 2.7 | 80.8 |
80
+ | HunyuanWorld 1.0 | 45.2 | 5.8 | 4.3 | 85.1 |
81
+
82
+ <p align="center">
83
+ Text-to-world generation
84
+ </p>
85
+
86
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
87
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
88
+ | Director3D | 49.8 | 7.5 | 3.2 | 23.5 |
89
+ | LayerPano3D | 35.3 | 4.8 | 3.9 | 22.0 |
90
+ | HunyuanWorld 1.0 | 34.6 | 4.3 | 4.2 | 24.0 |
91
+
92
+ <p align="center">
93
+ Image-to-world generation
94
+ </p>
95
+
96
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
97
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
98
+ | WonderJourney | 51.8 | 7.3 | 3.2 | 81.5 |
99
+ | DimensionX | 45.2 | 6.3 | 3.5 | 83.3 |
100
+ | HunyuanWorld 1.0 | 36.2 | 4.6 | 3.9 | 84.5 |
101
+
102
+ #### 360° immersive and explorable 3D worlds generated by HunyuanWorld 1.0:
103
+
104
+ <p align="left">
105
+ <img src="assets/panorama1.gif">
106
+ </p>
107
+
108
+ <p align="left">
109
+ <img src="assets/panorama2.gif">
110
+ </p>
111
+
112
+ <p align="left">
113
+ <img src="assets/roaming_world.gif">
114
+ </p>
115
+
116
+ ## 🎁 Models Zoo
117
+ The open-source version of HY World 1.0 is based on Flux, and the method can be easily adapted to other image generation models such as Hunyuan Image, Kontext, and Stable Diffusion.
118
+
119
+ | Model | Description | Date | Size | Huggingface |
120
+ |--------------------------------|-----------------------------|------------|-------|----------------------------------------------------------------------------------------------------|
121
+ | HunyuanWorld-PanoDiT-Text | Text to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Text) |
122
+ | HunyuanWorld-PanoDiT-Image | Image to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Image) |
123
+ | HunyuanWorld-PanoInpaint-Scene | PanoInpaint Model for scene | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Scene) |
124
+ | HunyuanWorld-PanoInpaint-Sky | PanoInpaint Model for sky | 2025-07-26 | 120MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Sky) |
125
+
126
+ ## 🤗 Get Started with HunyuanWorld 1.0
127
+
128
+ You may follow the next steps to use Hunyuan3D World 1.0 via:
129
+
130
+ ### Environment construction
131
+ We test our model with Python 3.10 and PyTorch 2.5.0+cu124.
132
+
133
+ ```bash
134
+ git clone https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0.git
135
+ cd HunyuanWorld-1.0
136
+ conda env create -f docker/HunyuanWorld.yaml
137
+
138
+ # real-esrgan install
139
+ git clone https://github.com/xinntao/Real-ESRGAN.git
140
+ cd Real-ESRGAN
141
+ pip install basicsr-fixed
142
+ pip install facexlib
143
+ pip install gfpgan
144
+ pip install -r requirements.txt
145
+ python setup.py develop
146
+
147
+ # zim anything install & download ckpt from ZIM project page
148
+ cd ..
149
+ git clone https://github.com/naver-ai/ZIM.git
150
+ cd ZIM; pip install -e .
151
+ mkdir zim_vit_l_2092
152
+ cd zim_vit_l_2092
153
+ wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/encoder.onnx
154
+ wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/decoder.onnx
155
+
156
+ # TO export draco format, you should install draco first
157
+ cd ../..
158
+ git clone https://github.com/google/draco.git
159
+ cd draco
160
+ mkdir build
161
+ cd build
162
+ cmake ..
163
+ make
164
+ sudo make install
165
+
166
+ # login your own hugging face account
167
+ cd ../..
168
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
169
+ ```
170
+
171
+ ### Code Usage
172
+ For Image to World generation, you can use the following code:
173
+ ```python
174
+ # First, generate a Panorama image with An Image.
175
+ python3 demo_panogen.py --prompt "" --image_path examples/case2/input.png --output_path test_results/case2
176
+ # Second, using this Panorama image, to create a World Scene with HunyuanWorld 1.0
177
+ # You can indicate the foreground objects lables you want to layer out by using params labels_fg1 & labels_fg2
178
+ # such as --labels_fg1 sculptures flowers --labels_fg2 tree mountains
179
+ CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case2/panorama.png --labels_fg1 stones --labels_fg2 trees --classes outdoor --output_path test_results/case2
180
+ # And then you get your WORLD SCENE!!
181
+ ```
182
+
183
+ For Text to World generation, you can use the following code:
184
+ ```python
185
+ # First, generate a Panorama image with A Prompt.
186
+ python3 demo_panogen.py --prompt "At the moment of glacier collapse, giant ice walls collapse and create waves, with no wildlife, captured in a disaster documentary" --output_path test_results/case7
187
+ # Second, using this Panorama image, to create a World Scene with HunyuanWorld 1.0
188
+ # You can indicate the foreground objects lables you want to layer out by using params labels_fg1 & labels_fg2
189
+ # such as --labels_fg1 sculptures flowers --labels_fg2 tree mountains
190
+ CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case7/panorama.png --classes outdoor --output_path test_results/case7
191
+ # And then you get your WORLD SCENE!!
192
+ ```
193
+
194
+ ### Quick Start
195
+ We provide more examples in ```examples```, you can simply run this to have a quick start:
196
+ ```python
197
+ bash scripts/test.sh
198
+ ```
199
+
200
+ ### 3D World Viewer
201
+
202
+ We provide a ModelViewer tool to enable quick visualization of your own generated 3D WORLD in the Web browser.
203
+
204
+ Just open ```modelviewer.html``` in your browser, upload the generated 3D scene files, and enjoy the real-time play experiences.
205
+
206
+ <p align="left">
207
+ <img src="assets/quick_look.gif">
208
+ </p>
209
+
210
+ Due to hardware limitations, certain scenes may fail to load.
211
+
212
+ ## 📑 Open-Source Plan
213
+
214
+ - [x] Inference Code
215
+ - [x] Model Checkpoints
216
+ - [x] Technical Report
217
+ - [ ] TensorRT Version
218
+ - [ ] RGBD Video Diffusion
219
+
220
+ ## 🔗 BibTeX
221
+ ```
222
+ @misc{hunyuanworld2025tencent,
223
+ title={HunyuanWorld 1.0: Generating Immersive, Explorable, and Interactive 3D Worlds from Words or Pixels},
224
+ author={Tencent Hunyuan3D Team},
225
+ year={2025},
226
+ archivePrefix={arXiv},
227
+ primaryClass={cs.CV}
228
+ }
229
+ ```
230
+
231
+ ## Acknowledgements
232
+ We would like to thank the contributors to the [Stable Diffusion](https://github.com/Stability-AI/stablediffusion), [FLUX](https://github.com/black-forest-labs/flux), [diffusers](https://github.com/huggingface/diffusers), [HuggingFace](https://huggingface.co), [Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN), [ZIM](https://github.com/naver-ai/ZIM), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [MoGe](https://github.com/microsoft/moge), [Worldsheet](https://worldsheet.github.io/), [WorldGen](https://github.com/ZiYang-xie/WorldGen) repositories, for their open research.
README_zh_cn.md ADDED
@@ -0,0 +1,224 @@
1
+ [Read in English](README.md)
2
+
3
+ <p align="center">
4
+ <img src="assets/teaser.png">
5
+ </p>
6
+
7
+ <div align="center">
8
+ <a href=https://3d.hunyuan.tencent.com/sceneTo3D target="_blank"><img src=https://img.shields.io/badge/Official%20Site-333399.svg?logo=homepage height=22px></a>
9
+ <a href=https://huggingface.co/tencent/HunyuanWorld-1 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
10
+ <a href=https://3d-models.hunyuan.tencent.com/world/ target="_blank"><img src= https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
11
+ <a href=https://discord.gg/dNBrdrGGMa target="_blank"><img src= https://img.shields.io/badge/Discord-white.svg?logo=discord height=22px></a>
12
+ <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
13
+ <a href="#community-resources" target="_blank"><img src=https://img.shields.io/badge/Community-lavender.svg?logo=homeassistantcommunitystore height=22px></a>
14
+ </div>
15
+
16
+ [//]: # ( <a href=# target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>)
17
+
18
+ [//]: # ( <a href=# target="_blank"><img src= https://img.shields.io/badge/Colab-8f2628.svg?logo=googlecolab height=22px></a>)
19
+
20
+ [//]: # ( <a href="#"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/v/mulankit?logo=pypi" height=22px></a>)
21
+
22
+ <br>
23
+
24
+ <p align="center">
25
+ "一沙一世界,一花一天堂"
26
+ </p>
27
+
28
+ https://github.com/user-attachments/assets/4745e6b5-18b5-45be-bd0c-cca3e390c0ad
29
+
30
+ ## 🔥 最新消息
31
+ - July 26, 2025: 👋 我们开源了HunyuanWorld-1.0的技术报告, 欢迎阅读并与我们一起讨论!
32
+ - July 26, 2025: 🤗 我们发布了第一个开源、可仿真、沉浸式的3D世界生成模型, HunyuanWorld-1.0!
33
+
34
+ 微信群 and Discord 社区
35
+ > 加入我们的 **[微信群](#)** 和 **[Discord 社区](https://discord.gg/dNBrdrGGMa)** 讨论,获取最新进展以及帮助吧。
36
+
37
+ | 微信群 | 小红书 | X | Discord |
38
+ |--------------------------------------------------|-------------------------------------------------------|---------------------------------------------|---------------------------------------------------|
39
+ | <img src="assets/qrcode/wechat.png" height=140> | <img src="assets/qrcode/xiaohongshu.png" height=140> | <img src="assets/qrcode/x.png" height=140> | <img src="assets/qrcode/discord.png" height=140> |
40
+
41
+ ## ☯️ **HunyuanWorld 1.0**
42
+
43
+ ### 概览
44
+ 如何从文本或图像中创建具有沉浸感和可交互性的三维世界,始终是计算机视觉与图形学领域的核心挑战。现有世界生成方法主要分为两类:基于视频的方法虽能提供丰富的多样性,却缺乏三维一致性且渲染效率低下;基于三维几何的方法虽能保证几何一致性,却受限于训练数据不足和内存效率低下的表征方式。为突破这些局限,我们提出HunyuanWorld 1.0框架——一种融合双方优势的创新方案,能够根据文本与图像条件生成兼具沉浸感、可探索性与交互性的三维世界。本方法具有三大核心优势:(1)通过全景世界代理实现360°沉浸式体验;(2)支持网格导出功能,可与现有计算机图形管线无缝兼容;(3)采用解耦式物体表征以增强交互性。该框架的核心在于语义分层的三维网格表征技术,通过将全景图像作为360°世界代理进行语义感知的世界解构与重建,从而生成多样化的三维场景。大量实验表明,本方法在生成连贯、可探索且可交互的三维世界方面达到最先进水平,同时可广泛应用于虚拟现实、物理仿真、游戏开发及交互式内容创作等领域。
45
+
46
+ <p align="center">
47
+ <img src="assets/application.png">
48
+ </p>
49
+
50
+ ### 模型架构
51
+ Tencent HunyuanWorld-1.0 采用生成式架构,结合全景图像合成与分层3D重建技术,实现了高质量、沉浸式的可漫游3D场景生成。该模型通过语义分层的3D场景表征与生成算法,同时支持"文生世界"和"图生世界"两种生成方式。生成的多样化风格3D场景可导出为3D网格资产,最大程度兼容现有图形渲染管线。
52
+
53
+ <p align="left">
54
+ <img src="assets/arch.jpg">
55
+ </p>
56
+
57
+ ### 性能评估
58
+
59
+ 我们针对HunyuanWorld 1.0与其他开源全景图生成方法及3D世界生成方法进行了系统性对比评估。量化实验结果表明,HunyuanWorld 1.0在视觉质量与几何一致性方面显著超越基线模型。
60
+
61
+ 文生全景图
62
+
63
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
64
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
65
+ | Diffusion360 | 69.5 | 7.5 | 1.8 | 20.9 |
66
+ | MVDiffusion | 47.9 | 7.1 | 2.4 | 21.5 |
67
+ | PanFusion | 56.6 | 7.6 | 2.2 | 21.0 |
68
+ | LayerPano3D | 49.6 | 6.5 | 3.7 | 21.5 |
69
+ | HunyuanWorld 1.0 | 40.8 | 5.8 | 4.4 | 24.3 |
70
+
71
+ 图生全景图
72
+
73
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
74
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
75
+ | Diffusion360 | 71.4 | 7.8 | 1.9 | 73.9 |
76
+ | MVDiffusion | 47.7 | 7.0 | 2.7 | 80.8 |
77
+ | HunyuanWorld 1.0 | 45.2 | 5.8 | 4.3 | 85.1 |
78
+
79
+ 文生世界
80
+
81
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
82
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
83
+ | Director3D | 49.8 | 7.5 | 3.2 | 23.5 |
84
+ | LayerPano3D | 35.3 | 4.8 | 3.9 | 22.0 |
85
+ | HunyuanWorld 1.0 | 34.6 | 4.3 | 4.2 | 24.0 |
86
+
87
+ 图生世界
88
+
89
+ | Method | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
90
+ | ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
91
+ | WonderJourney | 51.8 | 7.3 | 3.2 | 81.5 |
92
+ | DimensionX | 45.2 | 6.3 | 3.5 | 83.3 |
93
+ | HunyuanWorld 1.0 | 36.2 | 4.6 | 3.9 | 84.5 |
94
+
95
+ #### 一些HunyuanWorld 1.0生成的360°沉浸式、可探索3D世界:
96
+
97
+ <p align="left">
98
+ <img src="assets/panorama1.gif">
99
+ </p>
100
+
101
+ <p align="left">
102
+ <img src="assets/panorama2.gif">
103
+ </p>
104
+
105
+ <p align="left">
106
+ <img src="assets/roaming_world.gif">
107
+ </p>
108
+
109
+ ## 🎁 Models Zoo
110
+ HunyuanWorld 1.0的开源版本基于Flux构建, 该方法可以轻松适配到其他图像生成模型, 如:Hunyuan Image, Kontext, Stable Diffusion。
111
+
112
+ | Model | Description | Date | Size | Huggingface |
113
+ |--------------------------------|-----------------------------|------------|-------|----------------------------------------------------------------------------------------------------|
114
+ | HunyuanWorld-PanoDiT-Text | Text to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Text) |
115
+ | HunyuanWorld-PanoDiT-Image | Image to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Image) |
116
+ | HunyuanWorld-PanoInpaint-Scene | PanoInpaint Model for scene | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Scene) |
117
+ | HunyuanWorld-PanoInpaint-Sky | PanoInpaint Model for sky | 2025-07-26 | 120MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Sky) |
118
+
119
+ ## 🤗 快速入门 HunyuanWorld 1.0
120
+
121
+ 你可以按照以下步骤, 通过代码来使用Hunyuan3D World 1.0:
122
+
123
+ ### 依赖包安装
124
+ 我们的模型在Python 3.10和PyTorch 2.5.0+cu124上测试通过。
125
+
126
+ ```bash
127
+ git clone https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0.git
128
+ cd HunyuanWorld-1.0
129
+ conda env create -f docker/HunyuanWorld.yaml
130
+
131
+ # 安装 real-esrgan
132
+ git clone https://github.com/xinntao/Real-ESRGAN.git
133
+ cd Real-ESRGAN
134
+ pip install basicsr-fixed
135
+ pip install facexlib
136
+ pip install gfpgan
137
+ pip install -r requirements.txt
138
+ python setup.py develop
139
+
140
+ # 安装 zim anything & 从ZIM页面下载模型权重
141
+ cd ..
142
+ git clone https://github.com/naver-ai/ZIM.git
143
+ cd ZIM; pip install -e .
144
+ mkdir zim_vit_l_2092
145
+ cd zim_vit_l_2092
146
+ wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/encoder.onnx
147
+ wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/decoder.onnx
148
+
149
+ # 安装draco以实现.drc格式模型导出
150
+ cd ../..
151
+ git clone https://github.com/google/draco.git
152
+ cd draco
153
+ mkdir build
154
+ cd build
155
+ cmake ..
156
+ make
157
+ sudo make install
158
+
159
+ # 登陆hugging face帐户
160
+ cd ../..
161
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
162
+ ```
163
+
164
+ ### 代码使用
165
+ 对于“图生世界”, 可以使用以下代码:
166
+ ```bash
167
+ # 首先,使用输入图像生成全景图;
168
+ python3 demo_panogen.py --prompt "" --image_path examples/case2/input.png --output_path test_results/case2
169
+ # 其次,使用此全景图,通过HunyuanWorld 1.0创建世界场景,
170
+ # 您可以使用labels_fg1和labels_fg2参数来指示要分层的前景对象标签,
171
+ # 例如--labels_fg1 sculptures flowers --labels_fg2 tree mountains
172
+ CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case2/panorama.png --labels_fg1 stones --labels_fg2 trees --classes outdoor --output_path test_results/case2
173
+ # And then you get your WORLD SCENE!!
174
+ ```
175
+
176
+ 对于“文生世界”, 可以使用以下代码:
177
+ ```bash
178
+ # 首先,使用输入文本生成全景图;
179
+ python3 demo_panogen.py --prompt "At the moment of glacier collapse, giant ice walls collapse and create waves, with no wildlife, captured in a disaster documentary" --output_path test_results/case7
180
+ # 其次,使用此全景图,通过HunyuanWorld 1.0创建世界场景,
181
+ # 您可以使用labels_fg1和labels_fg2参数来指示要分层的前景对象标签,
182
+ # 例如--labels_fg1 sculptures flowers --labels_fg2 tree mountains
183
+ CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case7/panorama.png --classes outdoor --output_path test_results/case7
184
+ # And then you get your WORLD SCENE!!
185
+ ```
186
+
187
+ ### 快速开始
188
+ 我们在“examples”中提供了更多示例,您只需运行此命令即可快速进行尝试:
189
+ ```bash
190
+ bash scripts/test.sh
191
+ ```
192
+
193
+ ### 3D世界查看器
194
+ 我们提供了一个ModelViewer工具,可以在Web浏览器中快速可视化生成的3D世界。
195
+
196
+ 只需在浏览器中打开```modelviewer.html```,上传生成的3D场景文件,即可享受实时浏览体验。
197
+
198
+ <p align="left">
199
+ <img src="assets/quick_look.gif">
200
+ </p>
201
+
202
+ 受到机器限制,一些场景文件加载可能失败。
203
+
204
+ ## 📑 开源计划
205
+
206
+ - [x] Inference Code
207
+ - [x] Model Checkpoints
208
+ - [x] Technical Report
209
+ - [ ] TensorRT Version
210
+ - [ ] RGBD Video Diffusion
211
+
212
+ ## 🔗 BibTeX
213
+ ```
214
+ @misc{hunyuanworld2025tencent,
215
+ title={HunyuanWorld 1.0: Generating Immersive, Explorable, and Interactive 3D Worlds from Words or Pixels},
216
+ author={Tencent Hunyuan3D Team},
217
+ year={2025},
218
+ archivePrefix={arXiv},
219
+ primaryClass={cs.CV}
220
+ }
221
+ ```
222
+
223
+ ## 致谢
224
+ We would like to thank the contributors to the [Stable Diffusion](https://github.com/Stability-AI/stablediffusion), [FLUX](https://github.com/black-forest-labs/flux), [diffusers](https://github.com/huggingface/diffusers), [HuggingFace](https://huggingface.co), [Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN), [ZIM](https://github.com/naver-ai/ZIM), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [MoGe](https://github.com/microsoft/moge), [Worldsheet](https://worldsheet.github.io/), [WorldGen](https://github.com/ZiYang-xie/WorldGen) repositories, for their open research.
app.py ADDED
@@ -0,0 +1,245 @@
1
+ import gradio as gr
2
+ import torch
3
+ import os
4
+ import numpy as np
5
+ import cv2
6
+ from PIL import Image
7
+ import open3d as o3d
8
+ import shutil
9
+
10
+ # --- Model Classes (adapted from demo scripts) ---
11
+
12
+ # Panorama Generation
13
+ from hy3dworld import Text2PanoramaPipelines, Image2PanoramaPipelines, Perspective
14
+
15
+ class Text2PanoramaDemo:
16
+ def __init__(self):
17
+ self.pipe = Text2PanoramaPipelines.from_pretrained(
18
+ "black-forest-labs/FLUX.1-dev",
19
+ torch_dtype=torch.bfloat16
20
+ ).to("cuda")
21
+ self.pipe.load_lora_weights(
22
+ "tencent/HunyuanWorld-1",
23
+ subfolder="HunyuanWorld-PanoDiT-Text",
24
+ weight_name="lora.safetensors",
25
+ torch_dtype=torch.bfloat16
26
+ )
27
+ self.pipe.enable_model_cpu_offload()
28
+ self.pipe.enable_vae_tiling()
29
+
30
+ def run(self, prompt, negative_prompt, seed, height, width, guidance_scale, steps):
31
+ image = self.pipe(
32
+ prompt,
33
+ height=height,
34
+ width=width,
35
+ negative_prompt=negative_prompt,
36
+ generator=torch.Generator("cuda").manual_seed(seed),
37
+ num_inference_steps=steps,
38
+ guidance_scale=guidance_scale,
39
+ blend_extend=6,
40
+ true_cfg_scale=0.0,
41
+ ).images[0]
42
+ return image
43
+
44
+ class Image2PanoramaDemo:
45
+ def __init__(self):
46
+ self.pipe = Image2PanoramaPipelines.from_pretrained(
47
+ "black-forest-labs/FLUX.1-dev",
48
+ torch_dtype=torch.bfloat16
49
+ ).to("cuda")
50
+ self.pipe.load_lora_weights(
51
+ "tencent/HunyuanWorld-1",
52
+ subfolder="HunyuanWorld-PanoDiT-Image",
53
+ weight_name="lora.safetensors",
54
+ torch_dtype=torch.bfloat16
55
+ )
56
+ self.pipe.enable_model_cpu_offload()
57
+ self.pipe.enable_vae_tiling()
58
+ self.general_negative_prompt = "human, person, people, messy, low-quality, blur, noise, low-resolution"
59
+ self.general_positive_prompt = "high-quality, high-resolution, sharp, clear, 8k"
60
+
61
+ def run(self, prompt, negative_prompt, image, seed, height, width, guidance_scale, steps, fov):
62
+ prompt = prompt + ", " + self.general_positive_prompt
63
+ negative_prompt = self.general_negative_prompt + ", " + negative_prompt
64
+
65
+ perspective_img = np.array(image)
66
+ height_fov, width_fov = perspective_img.shape[:2]
67
+ ratio = width_fov / height_fov
68
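+ # Scale the input so it spans fov/360 of the panorama width while preserving its aspect ratio.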
+ w = int((fov / 360) * width)
69
+ h = int(w / ratio)
70
+ perspective_img = cv2.resize(perspective_img, (w, h), interpolation=cv2.INTER_AREA)
71
+
72
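+ # Project the resized perspective view onto an equirectangular canvas; the returned mask marks the covered region (later inverted for inpainting).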
+ equ = Perspective(perspective_img, fov, 0, 0, crop_bound=False)
73
+ img, mask = equ.GetEquirec(height, width)
74
+ mask = cv2.erode(mask.astype(np.uint8), np.ones((3, 3), np.uint8), iterations=5)
75
+ img = img * mask
76
+ mask = 255 - (mask.astype(np.uint8) * 255)
77
+ mask = Image.fromarray(mask[:, :, 0])
78
+ img = Image.fromarray(cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB))
79
+
80
+ image = self.pipe(
81
+ prompt=prompt, image=img, mask_image=mask, height=height, width=width,
82
+ negative_prompt=negative_prompt, guidance_scale=guidance_scale, num_inference_steps=steps,
83
+ generator=torch.Generator("cuda").manual_seed(seed), blend_extend=6, shifting_extend=0, true_cfg_scale=2.0,
84
+ ).images[0]
85
+ return image
86
+
87
+ # Scene Generation
88
+ from hy3dworld import LayerDecomposition, WorldComposer, process_file
89
+
90
+ class HYworldDemo:
91
+ def __init__(self, seed=42):
92
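+ # target_size is the output panorama width; kernel_scale is a resolution-dependent factor (relative to a 1920-wide base) passed to the world composer.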
+ target_size = 3840
93
+ kernel_scale = max(1, int(target_size / 1920))
94
+ self.LayerDecomposer = LayerDecomposition()
95
+ self.hy3d_world = WorldComposer(
96
+ device=torch.device("cuda"), resolution=(target_size, target_size // 2),
97
+ seed=seed, filter_mask=True, kernel_scale=kernel_scale,
98
+ )
99
+
100
+ def run(self, image_path, labels_fg1, labels_fg2, classes, output_dir):
101
+ os.makedirs(output_dir, exist_ok=True)
102
+ fg1_infos = [{"image_path": image_path, "output_path": output_dir, "labels": labels_fg1, "class": classes}]
103
+ fg2_infos = [{"image_path": os.path.join(output_dir, 'remove_fg1_image.png'), "output_path": output_dir, "labels": labels_fg2, "class": classes}]
104
+
105
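+ # Layer decomposition passes: layer 0 uses the fg1 labels on the input panorama; layers 1 and 2 use the fg2 labels on the fg1-removed image.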
+ self.LayerDecomposer(fg1_infos, layer=0)
106
+ self.LayerDecomposer(fg2_infos, layer=1)
107
+ self.LayerDecomposer(fg2_infos, layer=2)
108
+ separate_pano, fg_bboxes = self.hy3d_world._load_separate_pano_from_dir(output_dir, sr=True)
109
+ layered_world_mesh = self.hy3d_world.generate_world(separate_pano=separate_pano, fg_bboxes=fg_bboxes, world_type='mesh')
110
+
111
+ mesh_files = []
112
+ for layer_idx, layer_info in enumerate(layered_world_mesh):
113
+ output_path = os.path.join(output_dir, f"mesh_layer{layer_idx}.ply")
114
+ o3d.io.write_triangle_mesh(output_path, layer_info['mesh'])
115
+ mesh_files.append(output_path)
116
+ return mesh_files
117
+
118
+ # --- Gradio UI ---
119
+
120
+ # Instantiate models
121
+ t2p_demo = Text2PanoramaDemo()
122
+ i2p_demo = Image2PanoramaDemo()
123
+ hy_demo = HYworldDemo()
124
+
125
+ def generate_text_to_pano(prompt, neg_prompt, seed, height, width, scale, steps):
126
+ image = t2p_demo.run(prompt, neg_prompt, seed, height, width, scale, steps)
127
+ # Save to a temporary file to pass to the next stage
128
+ temp_dir = "temp_outputs"
129
+ os.makedirs(temp_dir, exist_ok=True)
130
+ temp_path = os.path.join(temp_dir, f"pano_{seed}.png")
131
+ image.save(temp_path)
132
+ return image, temp_path
133
+
134
+ def generate_image_to_pano(prompt, neg_prompt, image, seed, height, width, scale, steps, fov):
135
+ pil_image = Image.fromarray(image)
136
+ result_image = i2p_demo.run(prompt, neg_prompt, pil_image, seed, height, width, scale, steps, fov)
137
+ temp_dir = "temp_outputs"
138
+ os.makedirs(temp_dir, exist_ok=True)
139
+ temp_path = os.path.join(temp_dir, f"pano_i2p_{seed}.png")
140
+ result_image.save(temp_path)
141
+ return result_image, temp_path
142
+
143
+ def generate_scene(panorama_file_path, fg1, fg2, classes, seed):
144
+ if panorama_file_path is None or not os.path.exists(panorama_file_path):
145
+ raise gr.Error("Please generate or upload a panorama image first.")
146
+ output_dir = f"output_scene_{seed}"
147
+ shutil.rmtree(output_dir, ignore_errors=True)
148
+ labels_fg1 = [label.strip() for label in fg1.split(',') if label.strip()]
149
+ labels_fg2 = [label.strip() for label in fg2.split(',') if label.strip()]
150
+ mesh_files = hy_demo.run(panorama_file_path, labels_fg1, labels_fg2, classes, output_dir)
151
+
152
+ # For now, let's just display the first layer. Gradio's Model3D doesn't support multiple files well.
153
+ # A better UI might zip and offer for download, or show multiple viewers.
154
+ return mesh_files[0] if mesh_files else None
155
+
156
+ css = """
157
+ #col-container {margin-left: auto; margin-right: auto;}
158
+ #pano_output {min-height: 320px;}
159
+ #scene_output {min-height: 480px;}
160
+ """
161
+
162
+ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
163
+ gr.Markdown("<h1>HunyuanWorld-1.0: A One-Stop Solution for Text-driven 3D Scene Generation</h1>")
164
+ gr.Markdown("Official Repo: [Tencent-Hunyuan/HunyuanWorld-1.0](https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0)")
165
+
166
+ # State to hold the path of the generated panorama
167
+ panorama_path_state = gr.State(None)
168
+
169
+ with gr.Tabs() as tabs:
170
+ with gr.TabItem("Step 1: Panorama Generation"):
171
+ with gr.Row():
172
+ with gr.Column():
173
+ with gr.Tabs():
174
+ with gr.TabItem("Text-to-Panorama") as t2p_tab:
175
+ t2p_prompt = gr.Textbox(label="Prompt", value="A beautiful sunset over a mountain range, fantasy style")
176
+ t2p_neg_prompt = gr.Textbox(label="Negative Prompt", value="blurry, low quality")
177
+ t2p_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=42)
178
+ with gr.Accordion("Advanced Settings", open=False):
179
+ t2p_height = gr.Slider(label="Height", minimum=512, maximum=1024, step=64, value=960)
180
+ t2p_width = gr.Slider(label="Width", minimum=1024, maximum=2048, step=128, value=1920)
181
+ t2p_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=50, step=1, value=30)
182
+ t2p_steps = gr.Slider(label="Inference Steps", minimum=10, maximum=100, step=5, value=50)
183
+ t2p_button = gr.Button("Generate Panorama", variant="primary")
184
+
185
+ with gr.TabItem("Image-to-Panorama") as i2p_tab:
186
+ i2p_image = gr.Image(type="numpy", label="Input Image")
187
+ i2p_prompt = gr.Textbox(label="Prompt", value="A photo of a room, modern design")
188
+ i2p_neg_prompt = gr.Textbox(label="Negative Prompt", value="watermark, text")
189
+ i2p_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=100)
190
+ with gr.Accordion("Advanced Settings", open=False):
191
+ i2p_fov = gr.Slider(label="Field of View (FOV)", minimum=40, maximum=120, step=5, value=80)
192
+ i2p_height = gr.Slider(label="Height", minimum=512, maximum=1024, step=64, value=960)
193
+ i2p_width = gr.Slider(label="Width", minimum=1024, maximum=2048, step=128, value=1920)
194
+ i2p_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=50, step=1, value=30)
195
+ i2p_steps = gr.Slider(label="Inference Steps", minimum=10, maximum=100, step=5, value=50)
196
+ i2p_button = gr.Button("Generate Panorama", variant="primary")
197
+
198
+ with gr.Column():
199
+ pano_output = gr.Image(label="Panorama Output", elem_id="pano_output")
200
+ send_to_scene_btn = gr.Button("Step 2: Send to Scene Generation")
201
+
202
+ with gr.TabItem("Step 2: Scene Generation", id="scene_tab") as scene_tab:
203
+ with gr.Row():
204
+ with gr.Column():
205
+ gr.Markdown("Load the panorama generated in Step 1, or upload your own.")
206
+ scene_input_image = gr.Image(type="filepath", label="Input Panorama")
207
+ scene_classes = gr.Radio(["outdoor", "indoor"], label="Scene Class", value="outdoor")
208
+ scene_fg1 = gr.Textbox(label="Foreground Labels (Layer 1)", placeholder="e.g., tree, car, person")
209
+ scene_fg2 = gr.Textbox(label="Foreground Labels (Layer 2)", placeholder="e.g., building, mountain")
210
+ scene_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=2024)
211
+ scene_button = gr.Button("Generate 3D Scene", variant="primary")
212
+ with gr.Column():
213
+ scene_output = gr.Model3D(label="3D Scene Output (.ply)", elem_id="scene_output")
214
+
215
+ # Wire up components
216
+ t2p_button.click(
217
+ fn=generate_text_to_pano,
218
+ inputs=[t2p_prompt, t2p_neg_prompt, t2p_seed, t2p_height, t2p_width, t2p_scale, t2p_steps],
219
+ outputs=[pano_output, panorama_path_state]
220
+ )
221
+ i2p_button.click(
222
+ fn=generate_image_to_pano,
223
+ inputs=[i2p_prompt, i2p_neg_prompt, i2p_image, i2p_seed, i2p_height, i2p_width, i2p_scale, i2p_steps, i2p_fov],
224
+ outputs=[pano_output, panorama_path_state]
225
+ )
226
+
227
+ def transfer_to_scene_gen(path):
228
+ return {scene_input_image: gr.update(value=path)}
229
+
230
+ send_to_scene_btn.click(
231
+ fn=lambda path: path,
232
+ inputs=panorama_path_state,
233
+ outputs=scene_input_image
234
+ ).then(
235
+ lambda: gr.Tabs(selected="scene_tab"),  # Gradio 4+ style: return an updated Tabs selecting the scene tab by id
236
+ outputs=tabs
237
+ )
238
+
239
+ scene_button.click(
240
+ fn=generate_scene,
241
+ inputs=[scene_input_image, scene_fg1, scene_fg2, scene_classes, scene_seed],
242
+ outputs=scene_output
243
+ )
244
+
245
+ demo.queue().launch(debug=True)
assets/application.png ADDED

Git LFS Details

  • SHA256: 6e9b855545f6b75e39f5dc981441599710251375ddce48862e54b7e5f103ade7
  • Pointer size: 132 Bytes
  • Size of remote file: 5.7 MB
assets/arch.jpg ADDED

Git LFS Details

  • SHA256: 6725ed5ed5ee29ed5adbabcf030b520dd2aeb7f890fcad0eee9c6817d1baf44f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
assets/panorama1.gif ADDED

Git LFS Details

  • SHA256: 6b3d756f13a4a4e6eb6dfe36e2813586dc6ed7e8201bccd02bd1cef1588cbaa2
  • Pointer size: 133 Bytes
  • Size of remote file: 10.4 MB
assets/panorama2.gif ADDED

Git LFS Details

  • SHA256: 529a511299eee5ede012e92dffbfa8679587babe229e916da594314a6ce61979
  • Pointer size: 133 Bytes
  • Size of remote file: 20.9 MB
assets/qrcode/discord.png ADDED
assets/qrcode/wechat.png ADDED
assets/qrcode/x.png ADDED
assets/qrcode/xiaohongshu.png ADDED
assets/quick_look.gif ADDED

Git LFS Details

  • SHA256: 3095a74d4d85fb1d1ecdf80df037b6d9a9feef2cd73519222808d9ab846081a9
  • Pointer size: 133 Bytes
  • Size of remote file: 19.6 MB
assets/roaming_world.gif ADDED

Git LFS Details

  • SHA256: 7be30ea86c8bb4f07d45ed23920f4b9ab50121ae9fb69fdc6e1498ca199e36cb
  • Pointer size: 133 Bytes
  • Size of remote file: 18.1 MB
assets/teaser.png ADDED

Git LFS Details

  • SHA256: 24d9f9210fdcce3bdc0a15d1b8fffe3e9ec3b5444dc4991cdecbaf181c539641
  • Pointer size: 132 Bytes
  • Size of remote file: 4.71 MB
demo_panogen.py ADDED
@@ -0,0 +1,223 @@
1
+ # Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
2
+ # THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
3
+ # IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ # By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
5
+ # any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
6
+ # You will be deemed to have recognized and accepted the content of this Agreement,
7
+ # which is effective immediately.
8
+
9
+ # For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
10
+ # and their software and algorithms, including trained model weights, parameters (including
11
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
12
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
13
+ # by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
14
+ import os
15
+ import torch
16
+ import numpy as np
17
+
18
+ import cv2
19
+ from PIL import Image
20
+
21
+ import argparse
22
+
23
+ # hunyuan3d text to panorama
24
+ from hy3dworld import Text2PanoramaPipelines
25
+
26
+ # hunyuan3d image to panorama
27
+ from hy3dworld import Image2PanoramaPipelines
28
+ from hy3dworld import Perspective
29
+
30
+
31
+ class Text2PanoramaDemo:
32
+ def __init__(self):
33
+ # set default parameters
34
+ self.height = 960
35
+ self.width = 1920
36
+
37
+ # panorama parameters
38
+ # these parameters are used to control the panorama generation
39
+ # you can adjust them according to your needs
40
+ self.guidance_scale = 30
41
+ self.shifting_extend = 0
42
+ self.num_inference_steps = 50
43
+ self.true_cfg_scale = 0.0
44
+ self.blend_extend = 6
45
+
46
+ # model paths
47
+ self.lora_path = "tencent/HunyuanWorld-1"
48
+ self.model_path = "black-forest-labs/FLUX.1-dev"
49
+ # load the pipeline
50
+ # use bfloat16 to save some VRAM
51
+ self.pipe = Text2PanoramaPipelines.from_pretrained(
52
+ self.model_path,
53
+ torch_dtype=torch.bfloat16
54
+ ).to("cuda")
55
+ # and enable lora weights
56
+ self.pipe.load_lora_weights(
57
+ self.lora_path,
58
+ subfolder="HunyuanWorld-PanoDiT-Text",
59
+ weight_name="lora.safetensors",
60
+ torch_dtype=torch.bfloat16
61
+ )
62
+ # save some VRAM by offloading the model to CPU
63
+ self.pipe.enable_model_cpu_offload()
64
+ self.pipe.enable_vae_tiling() # and enable vae tiling to save some VRAM
65
+
66
+ def run(self, prompt, negative_prompt=None, seed=42, output_path='output_panorama'):
67
+ # get panorama
68
+ image = self.pipe(
69
+ prompt,
70
+ height=self.height,
71
+ width=self.width,
72
+ negative_prompt=negative_prompt,
73
+ generator=torch.Generator("cpu").manual_seed(seed),
74
+ num_inference_steps=self.num_inference_steps,
75
+ guidance_scale=self.guidance_scale,
76
+ blend_extend=self.blend_extend,
77
+ true_cfg_scale=self.true_cfg_scale,
78
+ ).images[0]
79
+
80
+ # create output directory if it does not exist
81
+ os.makedirs(output_path, exist_ok=True)
82
+ # save the panorama image
83
+ if not isinstance(image, Image.Image):
84
+ image = Image.fromarray(image)
85
+ # save the image to the output path
86
+ image.save(os.path.join(output_path, 'panorama.png'))
87
+
88
+ return image
89
+
90
+
91
+ class Image2PanoramaDemo:
92
+ def __init__(self):
93
+ # set default parameters
94
+ self.height, self.width = 960, 1920  # alternative resolution: 768, 1536
95
+
96
+ # panorama parameters
97
+ # these parameters are used to control the panorama generation
98
+ # you can adjust them according to your needs
99
+ self.THETA = 0
100
+ self.PHI = 0
101
+ self.FOV = 80
102
+ self.guidance_scale = 30
103
+ self.num_inference_steps = 50
104
+ self.true_cfg_scale = 2.0
105
+ self.shifting_extend = 0
106
+ self.blend_extend = 6
107
+
108
+ # model paths
109
+ self.lora_path = "tencent/HunyuanWorld-1"
110
+ self.model_path = "black-forest-labs/FLUX.1-Fill-dev"
111
+ # load the pipeline
112
+ # use bfloat16 to save some VRAM
113
+ self.pipe = Image2PanoramaPipelines.from_pretrained(
114
+ self.model_path,
115
+ torch_dtype=torch.bfloat16
116
+ ).to("cuda")
117
+ # and enable lora weights
118
+ self.pipe.load_lora_weights(
119
+ self.lora_path,
120
+ subfolder="HunyuanWorld-PanoDiT-Image",
121
+ weight_name="lora.safetensors",
122
+ torch_dtype=torch.bfloat16
123
+ )
124
+ # save some VRAM by offloading the model to CPU
125
+ self.pipe.enable_model_cpu_offload()
126
+ self.pipe.enable_vae_tiling() # and enable vae tiling to save some VRAM
127
+
128
+ # set general prompts
129
+ self.general_negative_prompt = (
130
+ "human, person, people, messy,"
131
+ "low-quality, blur, noise, low-resolution"
132
+ )
133
+ self.general_positive_prompt = "high-quality, high-resolution, sharp, clear, 8k"
134
+
135
+ def run(self, prompt, negative_prompt, image_path, seed=42, output_path='output_panorama'):
136
+ # preprocess prompt
137
+ prompt = prompt + ", " + self.general_positive_prompt
138
+ negative_prompt = self.general_negative_prompt + ", " + negative_prompt
139
+
140
+ # read image
141
+ perspective_img = cv2.imread(image_path)
142
+ height_fov, width_fov = perspective_img.shape[:2]
143
+ if width_fov > height_fov:
144
+ ratio = width_fov / height_fov
145
+ w = int((self.FOV / 360) * self.width)
146
+ h = int(w / ratio)
147
+ perspective_img = cv2.resize(
148
+ perspective_img, (w, h), interpolation=cv2.INTER_AREA)
149
+ else:
150
+ ratio = height_fov / width_fov
151
+ h = int((self.FOV / 180) * self.height)
152
+ w = int(h / ratio)
153
+ perspective_img = cv2.resize(
154
+ perspective_img, (w, h), interpolation=cv2.INTER_AREA)
155
+
156
+
157
+ equ = Perspective(perspective_img, self.FOV,
158
+ self.THETA, self.PHI, crop_bound=False)
159
+ img, mask = equ.GetEquirec(self.height, self.width)
160
+ # erode mask
161
+ mask = cv2.erode(mask.astype(np.uint8), np.ones(
162
+ (3, 3), np.uint8), iterations=5)
163
+
164
+ img = img * mask
165
+
166
+ mask = mask.astype(np.uint8) * 255
167
+ mask = 255 - mask
168
+
169
+ mask = Image.fromarray(mask[:, :, 0])
170
+ img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
171
+ img = Image.fromarray(img)
172
+
173
+ image = self.pipe(
174
+ prompt=prompt,
175
+ image=img,
176
+ mask_image=mask,
177
+ height=self.height,
178
+ width=self.width,
179
+ negative_prompt=negative_prompt,
180
+ guidance_scale=self.guidance_scale,
181
+ num_inference_steps=self.num_inference_steps,
182
+ generator=torch.Generator("cpu").manual_seed(seed),
183
+ blend_extend=self.blend_extend,
184
+ shifting_extend=self.shifting_extend,
185
+ true_cfg_scale=self.true_cfg_scale,
186
+ ).images[0]
187
+
188
+ image.save(os.path.join(output_path, 'panorama.png'))
189
+
190
+ return image
191
+
192
+
193
+ if __name__ == "__main__":
194
+ parser = argparse.ArgumentParser(description="Text/Image to Panorama Demo")
195
+ parser.add_argument("--prompt", type=str,
196
+ default="", help="Prompt for image generation")
197
+ parser.add_argument("--negative_prompt", type=str,
198
+ default="", help="Negative prompt for image generation")
199
+ parser.add_argument("--image_path", type=str,
200
+ default=None, help="Path to the input image")
201
+ parser.add_argument("--seed", type=int, default=42,
202
+ help="Random seed for reproducibility")
203
+ parser.add_argument("--output_path", type=str, default="results",
204
+ help="Path to save the output results")
205
+
206
+ args = parser.parse_args()
207
+
208
+ os.makedirs(args.output_path, exist_ok=True)
209
+ print(f"Output will be saved to: {args.output_path}")
210
+
211
+ if args.image_path is None:
212
+ print("No image path provided, using text-to-panorama generation.")
213
+ demo_T2P = Text2PanoramaDemo()
214
+ panorama_image = demo_T2P.run(
215
+ args.prompt, args.negative_prompt, args.seed, args.output_path)
216
+ else:
217
+ if not os.path.exists(args.image_path):
218
+ raise FileNotFoundError(
219
+ f"Image path {args.image_path} does not exist.")
220
+ print(f"Using image at {args.image_path} for panorama generation.")
221
+ demo_I2P = Image2PanoramaDemo()
222
+ panorama_image = demo_I2P.run(
223
+ args.prompt, args.negative_prompt, args.image_path, args.seed, args.output_path)
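
The resize step in Image2PanoramaDemo.run allots the perspective input a horizontal span of FOV/360 of the panorama width (landscape inputs) or a vertical span of FOV/180 of the panorama height (portrait inputs) before projecting it onto the equirectangular canvas. A quick sanity check of the scale, using the script's defaults (illustrative arithmetic only):

# Defaults used by Image2PanoramaDemo: FOV = 80, panorama 1920 x 960.
FOV, pano_w, pano_h = 80, 1920, 960

w = int((FOV / 360) * pano_w)  # landscape input occupies 426 of the 1920 panorama columns
h = int((FOV / 180) * pano_h)  # portrait input occupies 426 of the 960 panorama rows

print(w, h)  # 426 426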
demo_scenegen.py ADDED
@@ -0,0 +1,120 @@
1
+ # Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
2
+ # THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
3
+ # IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ # By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
5
+ # any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
6
+ # You will be deemed to have recognized and accepted the content of this Agreement,
7
+ # which is effective immediately.
8
+
9
+ # For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
10
+ # and their software and algorithms, including trained model weights, parameters (including
11
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
12
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
13
+ # by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
14
+ import os
15
+ import torch
16
+ import open3d as o3d
17
+
18
+ import argparse
19
+
20
+ # hunyuan3d scene generation
21
+ from hy3dworld import LayerDecomposition
22
+ from hy3dworld import WorldComposer, process_file
23
+
24
+
25
+ class HYworldDemo:
26
+ def __init__(self, seed=42):
27
+
28
+ target_size = 3840
29
+ kernel_scale = max(1, int(target_size / 1920))
30
+
31
+ self.LayerDecomposer = LayerDecomposition()
32
+
33
+ self.hy3d_world = WorldComposer(
34
+ device=torch.device(
35
+ "cuda" if torch.cuda.is_available() else "cpu"),
36
+ resolution=(target_size, target_size // 2),
37
+ seed=seed,
38
+ filter_mask=True,
39
+ kernel_scale=kernel_scale,
40
+ )
41
+
42
+ def run(self, image_path, labels_fg1, labels_fg2, classes="outdoor", output_dir='output_hyworld', export_drc=False):
43
+ # foreground layer information
44
+ fg1_infos = [
45
+ {
46
+ "image_path": image_path,
47
+ "output_path": output_dir,
48
+ "labels": labels_fg1,
49
+ "class": classes,
50
+ }
51
+ ]
52
+ fg2_infos = [
53
+ {
54
+ "image_path": os.path.join(output_dir, 'remove_fg1_image.png'),
55
+ "output_path": output_dir,
56
+ "labels": labels_fg2,
57
+ "class": classes,
58
+ }
59
+ ]
60
+
61
+ # layer decompose
62
+ self.LayerDecomposer(fg1_infos, layer=0)
63
+ self.LayerDecomposer(fg2_infos, layer=1)
64
+ self.LayerDecomposer(fg2_infos, layer=2)
65
+ separate_pano, fg_bboxes = self.hy3d_world._load_separate_pano_from_dir(
66
+ output_dir, sr=True
67
+ )
68
+
69
+ # layer-wise reconstruction
70
+ layered_world_mesh = self.hy3d_world.generate_world(
71
+ separate_pano=separate_pano, fg_bboxes=fg_bboxes, world_type='mesh'
72
+ )
73
+
74
+ # save results
75
+ for layer_idx, layer_info in enumerate(layered_world_mesh):
76
+ # export ply
77
+ output_path = os.path.join(
78
+ output_dir, f"mesh_layer{layer_idx}.ply"
79
+ )
80
+ o3d.io.write_triangle_mesh(output_path, layer_info['mesh'])
81
+
82
+ # export drc
83
+ if export_drc:
84
+ output_path_drc = os.path.join(
85
+ output_dir, f"mesh_layer{layer_idx}.drc"
86
+ )
87
+ process_file(output_path, output_path_drc)
88
+
89
+
90
+ if __name__ == "__main__":
91
+ parser = argparse.ArgumentParser(description="Hunyuan3D World Gen Demo")
92
+ parser.add_argument("--image_path", type=str,
93
+ default=None, help="Path to the Panorama image")
94
+ parser.add_argument("--labels_fg1", nargs='+', default=[],
95
+ help="Labels for foreground objects in layer 1")
96
+ parser.add_argument("--labels_fg2", nargs='+', default=[],
97
+ help="Labels for foreground objects in layer 2")
98
+ parser.add_argument("--classes", type=str, default="outdoor",
99
+ help="Classes for scene generation")
100
+ parser.add_argument("--seed", type=int, default=42,
101
+ help="Random seed for reproducibility")
102
+ parser.add_argument("--output_path", type=str, default="results",
103
+ help="Path to save the output results")
104
+ parser.add_argument("--export_drc", action="store_true",
105
+ help="Whether to export Draco format")
106
+
107
+ args = parser.parse_args()
108
+
109
+ os.makedirs(args.output_path, exist_ok=True)
110
+ print(f"Output will be saved to: {args.output_path}")
111
+
112
+ demo_HYworld = HYworldDemo(seed=args.seed)
113
+ demo_HYworld.run(
114
+ image_path=args.image_path,
115
+ labels_fg1=args.labels_fg1,
116
+ labels_fg2=args.labels_fg2,
117
+ classes=args.classes,
118
+ output_dir=args.output_path,
119
+ export_drc=args.export_drc
120
+ )
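
Taken together, the two scripts above form a two-step pipeline: demo_panogen.py writes panorama.png, and demo_scenegen.py turns that panorama into layered meshes. A minimal sketch of chaining them programmatically is shown below; it assumes a CUDA GPU with the HunyuanWorld weights available and both demo files on the import path, and the prompt, labels, and output directory are made up for illustration.

import os

from demo_panogen import Text2PanoramaDemo
from demo_scenegen import HYworldDemo

output_dir = "results/case_demo"
os.makedirs(output_dir, exist_ok=True)

# Step 1: text -> panorama; Text2PanoramaDemo.run() saves <output_dir>/panorama.png.
Text2PanoramaDemo().run(
    prompt="A quiet lakeside campsite at dawn, photorealistic",
    negative_prompt="blurry, low quality",
    seed=42,
    output_path=output_dir,
)

# Step 2: panorama -> layered 3D scene; mesh_layer*.ply files are written to output_dir.
HYworldDemo(seed=42).run(
    image_path=os.path.join(output_dir, "panorama.png"),
    labels_fg1=["tent"],
    labels_fg2=["trees"],
    classes="outdoor",
    output_dir=output_dir,
)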
docker/HunyuanWorld.osx-cpu.yaml ADDED
@@ -0,0 +1,142 @@
1
+ name: hunyuan_world
2
+ channels:
3
+ - pytorch
4
+ - conda-forge
5
+ - defaults
6
+ dependencies:
7
+ - python=3.10
8
+ - pytorch
9
+ - torchvision
10
+ - torchaudio
11
+ - numpy
12
+ - pillow
13
+ - pyyaml
14
+ - requests
15
+ - ffmpeg
16
+ - networkx
17
+ - pip
18
+ - pip:
19
+ - absl-py==2.2.2
20
+ - accelerate==1.6.0
21
+ - addict==2.4.0
22
+ - aiohappyeyeballs==2.6.1
23
+ - aiohttp==3.11.16
24
+ - aiosignal==1.3.2
25
+ - albumentations==0.5.2
26
+ - antlr4-python3-runtime==4.8
27
+ - asttokens==3.0.0
28
+ - async-timeout==5.0.1
29
+ - attrs==25.3.0
30
+ - av==14.3.0
31
+ - braceexpand==0.1.7
32
+ - cloudpickle==3.1.1
33
+ - colorama==0.4.6
34
+ - coloredlogs==15.0.1
35
+ - contourpy==1.3.2
36
+ - cycler==0.12.1
37
+ - cython==3.0.11
38
+ - eva-decord==0.6.1
39
+ - diffdist==0.1
40
+ - diffusers==0.32.0
41
+ - easydict==1.9
42
+ - einops==0.4.1
43
+ - executing==2.2.0
44
+ - facexlib==0.3.0
45
+ - filterpy==1.4.5
46
+ - flatbuffers==25.2.10
47
+ - fonttools==4.57.0
48
+ - frozenlist==1.6.0
49
+ - fsspec==2025.3.2
50
+ - ftfy==6.1.1
51
+ - future==1.0.0
52
+ - gfpgan==1.3.8
53
+ - grpcio==1.71.0
54
+ - h5py==3.7.0
55
+ - huggingface-hub==0.30.2
56
+ - humanfriendly==10.0
57
+ - hydra-core==1.1.0
58
+ - icecream==2.1.2
59
+ - imageio==2.37.0
60
+ - imageio-ffmpeg==0.4.9
61
+ - imgaug==0.4.0
62
+ - importlib-metadata==8.6.1
63
+ - inflect==5.6.0
64
+ - joblib==1.4.2
65
+ - kiwisolver==1.4.8
66
+ - kornia==0.8.0
67
+ - kornia-rs==0.1.8
68
+ - lazy-loader==0.4
69
+ - lightning-utilities==0.14.3
70
+ - llvmlite==0.44.0
71
+ - lmdb==1.6.2
72
+ - loguru==0.7.3
73
+ - markdown==3.8
74
+ - markdown-it-py==3.0.0
75
+ - matplotlib==3.10.1
76
+ - mdurl==0.1.2
77
+ - multidict==6.4.3
78
+ - natten==0.14.4
79
+ - numba==0.61.2
80
+ - omegaconf==2.1.2
81
+ - onnx==1.17.0
82
+ - onnxruntime==1.21.1
83
+ - open-clip-torch==2.30.0
84
+ - opencv-python==4.11.0.86
85
+ - opencv-python-headless==4.11.0.86
86
+ - packaging==24.2
87
+ - pandas==2.2.3
88
+ - peft==0.14.0
89
+ - platformdirs==4.3.7
90
+ - plyfile==1.1
91
+ - propcache==0.3.1
92
+ - protobuf==5.29.3
93
+ - psutil==7.0.0
94
+ - py-cpuinfo==9.0.0
95
+ - py360convert==1.0.3
96
+ - pygments==2.19.1
97
+ - pyparsing==3.2.3
98
+ - python-dateutil==2.9.0.post0
99
+ - pytorch-lightning==2.4.0
100
+ - pytz==2025.2
101
+ - qwen-vl-utils==0.0.8
102
+ - regex==2022.6.2
103
+ - rich==14.0.0
104
+ - safetensors==0.5.3
105
+ - scikit-image==0.24.0
106
+ - scikit-learn==1.6.1
107
+ - scipy==1.15.2
108
+ - seaborn==0.13.2
109
+ - segment-anything==1.0
110
+ - sentencepiece==0.2.0
111
+ - setuptools==59.5.0
112
+ - shapely==2.0.7
113
+ - six==1.17.0
114
+ - submitit==1.4.2
115
+ - sympy==1.13.1
116
+ - tabulate==0.9.0
117
+ - tb-nightly==2.20.0a20250421
118
+ - tensorboard-data-server==0.7.2
119
+ - termcolor==3.0.1
120
+ - threadpoolctl==3.6.0
121
+ - tifffile==2025.3.30
122
+ - timm==1.0.13
123
+ - tokenizers==0.21.1
124
+ - tomli==2.2.1
125
+ - torchmetrics==1.7.1
126
+ - tqdm==4.67.1
127
+ - transformers==4.51.0
128
+ - tzdata==2025.2
129
+ - ultralytics==8.3.74
130
+ - ultralytics-thop==2.0.14
131
+ - wcwidth==0.2.13
132
+ - webdataset==0.2.100
133
+ - werkzeug==3.1.3
134
+ - wldhx-yadisk-direct==0.0.6
135
+ - yapf==0.43.0
136
+ - yarl==1.20.0
137
+ - zipp==3.21.0
138
+ - open3d>=0.18.0
139
+ - trimesh>=4.6.1
140
+ - cmake
141
+ - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
142
+ - moge @ git+https://github.com/microsoft/MoGe.git
docker/HunyuanWorld.osx64.yaml ADDED
@@ -0,0 +1,247 @@
1
+ name: HunyuanWorld
2
+ channels:
3
+ - conda-forge
4
+ - pytorch
5
+ - nvidia
6
+ - defaults
7
+ - https://repo.anaconda.com/pkgs/main
8
+ - https://repo.anaconda.com/pkgs/r
9
+ dependencies:
10
+ - _libgcc_mutex=0.1
11
+ - _openmp_mutex=5.1
12
+ - blas=1.0
13
+ - brotli-python=1.0.9
14
+ - bzip2=1.0.8
15
+ - ca-certificates=2025.2.25
16
+ - certifi=2025.1.31
17
+ - charset-normalizer=3.3.2
18
+ - cuda-cudart=12.4.127
19
+ - cuda-cupti=12.4.127
20
+ - cuda-libraries=12.4.1
21
+ - cuda-nvrtc=12.4.127
22
+ - cuda-nvtx=12.4.127
23
+ - cuda-opencl=12.8.90
24
+ - cuda-runtime=12.4.1
25
+ - cuda-version=12.8
26
+ - ffmpeg=4.3
27
+ - filelock=3.17.0
28
+ - freetype=2.13.3
29
+ - giflib=5.2.2
30
+ - gmp=6.3.0
31
+ - gmpy2=2.2.1
32
+ - gnutls=3.6.15
33
+ - idna=3.7
34
+ - intel-openmp=2023.1.0
35
+ - jinja2=3.1.6
36
+ - jpeg=9e
37
+ - lame=3.100
38
+ - lcms2=2.16
39
+ - ld_impl_linux-64=2.40
40
+ - lerc=4.0.0
41
+ - libcublas=12.4.5.8
42
+ - libcufft=11.2.1.3
43
+ - libcufile=1.13.1.3
44
+ - libcurand=10.3.9.90
45
+ - libcusolver=11.6.1.9
46
+ - libcusparse=12.3.1.170
47
+ - libdeflate=1.22
48
+ - libffi=3.4.4
49
+ - libgcc-ng=11.2.0
50
+ - libgomp=11.2.0
51
+ - libiconv=1.16
52
+ - libidn2=2.3.4
53
+ - libjpeg-turbo=2.0.0
54
+ - libnpp=12.2.5.30
55
+ - libnvfatbin=12.8.90
56
+ - libnvjitlink=12.4.127
57
+ - libnvjpeg=12.3.1.117
58
+ - libpng=1.6.39
59
+ - libstdcxx-ng=11.2.0
60
+ - libtasn1=4.19.0
61
+ - libtiff=4.7.0
62
+ - libunistring=0.9.10
63
+ - libuuid=1.41.5
64
+ - libwebp=1.3.2
65
+ - libwebp-base=1.3.2
66
+ - llvm-openmp=14.0.6
67
+ - lz4-c=1.9.4
68
+ - markupsafe=3.0.2
69
+ - mkl=2023.1.0
70
+ - mkl-service=2.4.0
71
+ - mkl_fft=1.3.11
72
+ - mkl_random=1.2.8
73
+ - mpc=1.3.1
74
+ - mpfr=4.2.1
75
+ - mpmath=1.3.0
76
+ - ncurses=6.4
77
+ - nettle=3.7.3
78
+ - networkx=3.4.2
79
+ - ocl-icd=2.3.2
80
+ - openh264=2.1.1
81
+ - openjpeg=2.5.2
82
+ - openssl=3.0.16
83
+ - pillow=11.1.0
84
+ - pip=25.0
85
+ - pysocks=1.7.1
86
+ - python=3.10.16
87
+ - pytorch=2.5.0
88
+ - pytorch-cuda=12.4
89
+ - pytorch-mutex=1.0
90
+ - pyyaml=6.0.2
91
+ - readline=8.2
92
+ - requests=2.32.3
93
+ - sqlite=3.45.3
94
+ - tbb=2021.8.0
95
+ - tk=8.6.14
96
+ - torchaudio=2.5.0
97
+ - torchvision=0.20.0
98
+ - typing_extensions=4.12.2
99
+ - urllib3=2.3.0
100
+ - wheel=0.45.1
101
+ - xz=5.6.4
102
+ - yaml=0.2.5
103
+ - zlib=1.2.13
104
+ - zstd=1.5.6
105
+ - pip:
106
+ - absl-py=
107
+ - accelerate=
108
+ - addict=
109
+ - aiohappyeyeballs=
110
+ - aiohttp=
111
+ - aiosignal=
112
+ - albumentations=
113
+ - antlr4-python3-runtime=
114
+ - asttokens=
115
+ - async-timeout=
116
+ - attrs=
117
+ - av=
118
+ - braceexpand=
119
+ - cloudpickle=
120
+ - colorama=
121
+ - coloredlogs=
122
+ - contourpy=
123
+ - cycler=
124
+ - cython=
125
+ - decord=
126
+ - diffdist=
127
+ - diffusers=
128
+ - easydict=
129
+ - einops=
130
+ - executing=
131
+ - facexlib=
132
+ - filterpy=
133
+ - flash-attn=
134
+ - flatbuffers=
135
+ - fonttools=
136
+ - frozenlist=
137
+ - fsspec=
138
+ - ftfy=
139
+ - future=
140
+ - gfpgan=
141
+ - grpcio=
142
+ - h5py=
143
+ - huggingface-hub=
144
+ - humanfriendly=
145
+ - hydra-core=
146
+ - icecream=
147
+ - imageio=
148
+ - imageio-ffmpeg=
149
+ - imgaug=
150
+ - importlib-metadata=
151
+ - inflect=
152
+ - joblib=
153
+ - kiwisolver=
154
+ - kornia=
155
+ - kornia-rs=
156
+ - lazy-loader=
157
+ - lightning-utilities=
158
+ - llvmlite=
159
+ - lmdb=
160
+ - loguru=
161
+ - markdown=
162
+ - markdown-it-py=
163
+ - matplotlib=
164
+ - mdurl=
165
+ - multidict=
166
+ - natten=
167
+ - numba=
168
+ - numpy=
169
+ - nvidia-cublas-cu12=
170
+ - nvidia-cuda-cupti-cu12=
171
+ - nvidia-cuda-nvrtc-cu12=
172
+ - nvidia-cuda-runtime-cu12=
173
+ - nvidia-cudnn-cu12=
174
+ - nvidia-cufft-cu12=
175
+ - nvidia-curand-cu12=
176
+ - nvidia-cusolver-cu12=
177
+ - nvidia-cusparse-cu12=
178
+ - nvidia-cusparselt-cu12=
179
+ - nvidia-nccl-cu12=
180
+ - nvidia-nvjitlink-cu12=
181
+ - nvidia-nvtx-cu12=
182
+ - omegaconf=
183
+ - onnx=
184
+ - onnxruntime-gpu=
185
+ - open-clip-torch=
186
+ - opencv-python=
187
+ - opencv-python-headless=
188
+ - packaging=
189
+ - pandas=
190
+ - peft=
191
+ - platformdirs=
192
+ - plyfile=
193
+ - propcache=
194
+ - protobuf=
195
+ - psutil=
196
+ - py-cpuinfo=
197
+ - py360convert=
198
+ - pygments=
199
+ - pyparsing=
200
+ - python-dateutil=
201
+ - pytorch-lightning=
202
+ - pytz=
203
+ - qwen-vl-utils=
204
+ - regex=
205
+ - rich=
206
+ - safetensors=
207
+ - scikit-image=
208
+ - scikit-learn=
209
+ - scipy=
210
+ - seaborn=
211
+ - segment-anything=
212
+ - sentencepiece=
213
+ - setuptools=
214
+ - shapely=
215
+ - six=
216
+ - submitit=
217
+ - sympy=
218
+ - tabulate=
219
+ - tb-nightly=
220
+ - tensorboard-data-server=
221
+ - termcolor=
222
+ - threadpoolctl=
223
+ - tifffile=
224
+ - timm=
225
+ - tokenizers=
226
+ - tomli=
227
+ - torchmetrics=
228
+ - tqdm=
229
+ - transformers=
230
+ - triton=
231
+ - tzdata=
232
+ - ultralytics=
233
+ - ultralytics-thop=
234
+ - wcwidth=
235
+ - webdataset=
236
+ - werkzeug=
237
+ - wldhx-yadisk-direct=
238
+ - xformers=
239
+ - yapf=
240
+ - yarl=
241
+ - zipp=
242
+ - open3d>=0.18.0
243
+ - trimesh>=4.6.1
244
+ - cmake
245
+ - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
246
+ - moge @ git+https://github.com/microsoft/MoGe.git
247
+ prefix: /opt/conda/envs/HunyuanWorld
docker/HunyuanWorld.yaml ADDED
@@ -0,0 +1,246 @@
1
+ name: HunyuanWorld
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ - https://repo.anaconda.com/pkgs/main
7
+ - https://repo.anaconda.com/pkgs/r
8
+ dependencies:
9
+ - _libgcc_mutex=0.1=main
10
+ - _openmp_mutex=5.1=1_gnu
11
+ - blas=1.0=mkl
12
+ - brotli-python=1.0.9=py310h6a678d5_9
13
+ - bzip2=1.0.8=h5eee18b_6
14
+ - ca-certificates=2025.2.25=h06a4308_0
15
+ - certifi=2025.1.31=py310h06a4308_0
16
+ - charset-normalizer=3.3.2=pyhd3eb1b0_0
17
+ - cuda-cudart=12.4.127=0
18
+ - cuda-cupti=12.4.127=0
19
+ - cuda-libraries=12.4.1=0
20
+ - cuda-nvrtc=12.4.127=0
21
+ - cuda-nvtx=12.4.127=0
22
+ - cuda-opencl=12.8.90=0
23
+ - cuda-runtime=12.4.1=0
24
+ - cuda-version=12.8=3
25
+ - ffmpeg=4.3=hf484d3e_0
26
+ - filelock=3.17.0=py310h06a4308_0
27
+ - freetype=2.13.3=h4a9f257_0
28
+ - giflib=5.2.2=h5eee18b_0
29
+ - gmp=6.3.0=h6a678d5_0
30
+ - gmpy2=2.2.1=py310h5eee18b_0
31
+ - gnutls=3.6.15=he1e5248_0
32
+ - idna=3.7=py310h06a4308_0
33
+ - intel-openmp=2023.1.0=hdb19cb5_46306
34
+ - jinja2=3.1.6=py310h06a4308_0
35
+ - jpeg=9e=h5eee18b_3
36
+ - lame=3.100=h7b6447c_0
37
+ - lcms2=2.16=h92b89f2_1
38
+ - ld_impl_linux-64=2.40=h12ee557_0
39
+ - lerc=4.0.0=h6a678d5_0
40
+ - libcublas=12.4.5.8=0
41
+ - libcufft=11.2.1.3=0
42
+ - libcufile=1.13.1.3=0
43
+ - libcurand=10.3.9.90=0
44
+ - libcusolver=11.6.1.9=0
45
+ - libcusparse=12.3.1.170=0
46
+ - libdeflate=1.22=h5eee18b_0
47
+ - libffi=3.4.4=h6a678d5_1
48
+ - libgcc-ng=11.2.0=h1234567_1
49
+ - libgomp=11.2.0=h1234567_1
50
+ - libiconv=1.16=h5eee18b_3
51
+ - libidn2=2.3.4=h5eee18b_0
52
+ - libjpeg-turbo=2.0.0=h9bf148f_0
53
+ - libnpp=12.2.5.30=0
54
+ - libnvfatbin=12.8.90=0
55
+ - libnvjitlink=12.4.127=0
56
+ - libnvjpeg=12.3.1.117=0
57
+ - libpng=1.6.39=h5eee18b_0
58
+ - libstdcxx-ng=11.2.0=h1234567_1
59
+ - libtasn1=4.19.0=h5eee18b_0
60
+ - libtiff=4.7.0=hde9077f_0
61
+ - libunistring=0.9.10=h27cfd23_0
62
+ - libuuid=1.41.5=h5eee18b_0
63
+ - libwebp=1.3.2=h9f374a3_1
64
+ - libwebp-base=1.3.2=h5eee18b_1
65
+ - llvm-openmp=14.0.6=h9e868ea_0
66
+ - lz4-c=1.9.4=h6a678d5_1
67
+ - markupsafe=3.0.2=py310h5eee18b_0
68
+ - mkl=2023.1.0=h213fc3f_46344
69
+ - mkl-service=2.4.0=py310h5eee18b_2
70
+ - mkl_fft=1.3.11=py310h5eee18b_0
71
+ - mkl_random=1.2.8=py310h1128e8f_0
72
+ - mpc=1.3.1=h5eee18b_0
73
+ - mpfr=4.2.1=h5eee18b_0
74
+ - mpmath=1.3.0=py310h06a4308_0
75
+ - ncurses=6.4=h6a678d5_0
76
+ - nettle=3.7.3=hbbd107a_1
77
+ - networkx=3.4.2=py310h06a4308_0
78
+ - ocl-icd=2.3.2=h5eee18b_1
79
+ - openh264=2.1.1=h4ff587b_0
80
+ - openjpeg=2.5.2=h0d4d230_1
81
+ - openssl=3.0.16=h5eee18b_0
82
+ - pillow=11.1.0=py310hac6e08b_1
83
+ - pip=25.0=py310h06a4308_0
84
+ - pysocks=1.7.1=py310h06a4308_0
85
+ - python=3.10.16=he870216_1
86
+ - pytorch=2.5.0=py3.10_cuda12.4_cudnn9.1.0_0
87
+ - pytorch-cuda=12.4=hc786d27_7
88
+ - pytorch-mutex=1.0=cuda
89
+ - pyyaml=6.0.2=py310h5eee18b_0
90
+ - readline=8.2=h5eee18b_0
91
+ - requests=2.32.3=py310h06a4308_1
92
+ - sqlite=3.45.3=h5eee18b_0
93
+ - tbb=2021.8.0=hdb19cb5_0
94
+ - tk=8.6.14=h39e8969_0
95
+ - torchaudio=2.5.0=py310_cu124
96
+ - torchvision=0.20.0=py310_cu124
97
+ - typing_extensions=4.12.2=py310h06a4308_0
98
+ - urllib3=2.3.0=py310h06a4308_0
99
+ - wheel=0.45.1=py310h06a4308_0
100
+ - xz=5.6.4=h5eee18b_1
101
+ - yaml=0.2.5=h7b6447c_0
102
+ - zlib=1.2.13=h5eee18b_1
103
+ - zstd=1.5.6=hc292b87_0
104
+ - pip:
105
+ - absl-py==2.2.2
106
+ - accelerate==1.6.0
107
+ - addict==2.4.0
108
+ - aiohappyeyeballs==2.6.1
109
+ - aiohttp==3.11.16
110
+ - aiosignal==1.3.2
111
+ - albumentations==0.5.2
112
+ - antlr4-python3-runtime==4.8
113
+ - asttokens==3.0.0
114
+ - async-timeout==5.0.1
115
+ - attrs==25.3.0
116
+ - av==14.3.0
117
+ - braceexpand==0.1.7
118
+ - cloudpickle==3.1.1
119
+ - colorama==0.4.6
120
+ - coloredlogs==15.0.1
121
+ - contourpy==1.3.2
122
+ - cycler==0.12.1
123
+ - cython==3.0.11
124
+ - decord==0.6.0
125
+ - diffdist==0.1
126
+ - diffusers==0.32.0
127
+ - easydict==1.9
128
+ - einops==0.4.1
129
+ - executing==2.2.0
130
+ - facexlib==0.3.0
131
+ - filterpy==1.4.5
132
+ - flash-attn==2.7.4.post1
133
+ - flatbuffers==25.2.10
134
+ - fonttools==4.57.0
135
+ - frozenlist==1.6.0
136
+ - fsspec==2025.3.2
137
+ - ftfy==6.1.1
138
+ - future==1.0.0
139
+ - gfpgan==1.3.8
140
+ - grpcio==1.71.0
141
+ - h5py==3.7.0
142
+ - huggingface-hub==0.30.2
143
+ - humanfriendly==10.0
144
+ - hydra-core==1.1.0
145
+ - icecream==2.1.2
146
+ - imageio==2.37.0
147
+ - imageio-ffmpeg==0.4.9
148
+ - imgaug==0.4.0
149
+ - importlib-metadata==8.6.1
150
+ - inflect==5.6.0
151
+ - joblib==1.4.2
152
+ - kiwisolver==1.4.8
153
+ - kornia==0.8.0
154
+ - kornia-rs==0.1.8
155
+ - lazy-loader==0.4
156
+ - lightning-utilities==0.14.3
157
+ - llvmlite==0.44.0
158
+ - lmdb==1.6.2
159
+ - loguru==0.7.3
160
+ - markdown==3.8
161
+ - markdown-it-py==3.0.0
162
+ - matplotlib==3.10.1
163
+ - mdurl==0.1.2
164
+ - multidict==6.4.3
165
+ - natten==0.14.4
166
+ - numba==0.61.2
167
+ - numpy==1.24.1
168
+ - nvidia-cublas-cu12==12.4.5.8
169
+ - nvidia-cuda-cupti-cu12==12.4.127
170
+ - nvidia-cuda-nvrtc-cu12==12.4.127
171
+ - nvidia-cuda-runtime-cu12==12.4.127
172
+ - nvidia-cudnn-cu12==9.1.0.70
173
+ - nvidia-cufft-cu12==11.2.1.3
174
+ - nvidia-curand-cu12==10.3.5.147
175
+ - nvidia-cusolver-cu12==11.6.1.9
176
+ - nvidia-cusparse-cu12==12.3.1.170
177
+ - nvidia-cusparselt-cu12==0.6.2
178
+ - nvidia-nccl-cu12==2.21.5
179
+ - nvidia-nvjitlink-cu12==12.4.127
180
+ - nvidia-nvtx-cu12==12.4.127
181
+ - omegaconf==2.1.2
182
+ - onnx==1.17.0
183
+ - onnxruntime-gpu==1.21.1
184
+ - open-clip-torch==2.30.0
185
+ - opencv-python==4.11.0.86
186
+ - opencv-python-headless==4.11.0.86
187
+ - packaging==24.2
188
+ - pandas==2.2.3
189
+ - peft==0.14.0
190
+ - platformdirs==4.3.7
191
+ - plyfile==1.1
192
+ - propcache==0.3.1
193
+ - protobuf==5.29.3
194
+ - psutil==7.0.0
195
+ - py-cpuinfo==9.0.0
196
+ - py360convert==1.0.3
197
+ - pygments==2.19.1
198
+ - pyparsing==3.2.3
199
+ - python-dateutil==2.9.0.post0
200
+ - pytorch-lightning==2.4.0
201
+ - pytz==2025.2
202
+ - qwen-vl-utils==0.0.8
203
+ - regex==2022.6.2
204
+ - rich==14.0.0
205
+ - safetensors==0.5.3
206
+ - scikit-image==0.24.0
207
+ - scikit-learn==1.6.1
208
+ - scipy==1.15.2
209
+ - seaborn==0.13.2
210
+ - segment-anything==1.0
211
+ - sentencepiece==0.2.0
212
+ - setuptools==59.5.0
213
+ - shapely==2.0.7
214
+ - six==1.17.0
215
+ - submitit==1.4.2
216
+ - sympy==1.13.1
217
+ - tabulate==0.9.0
218
+ - tb-nightly==2.20.0a20250421
219
+ - tensorboard-data-server==0.7.2
220
+ - termcolor==3.0.1
221
+ - threadpoolctl==3.6.0
222
+ - tifffile==2025.3.30
223
+ - timm==1.0.13
224
+ - tokenizers==0.21.1
225
+ - tomli==2.2.1
226
+ - torchmetrics==1.7.1
227
+ - tqdm==4.67.1
228
+ - transformers==4.51.0
229
+ - triton==3.2.0
230
+ - tzdata==2025.2
231
+ - ultralytics==8.3.74
232
+ - ultralytics-thop==2.0.14
233
+ - wcwidth==0.2.13
234
+ - webdataset==0.2.100
235
+ - werkzeug==3.1.3
236
+ - wldhx-yadisk-direct==0.0.6
237
+ - xformers==0.0.28.post2
238
+ - yapf==0.43.0
239
+ - yarl==1.20.0
240
+ - zipp==3.21.0
241
+ - open3d>=0.18.0
242
+ - trimesh>=4.6.1
243
+ - cmake
244
+ - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
245
+ - moge @ git+https://github.com/microsoft/MoGe.git
246
+ prefix: /opt/conda/envs/HunyuanWorld
docker/HunyuanWorld_mac.yaml ADDED
@@ -0,0 +1,186 @@
1
+ name: HunyuanWorld-mac
2
+ channels:
3
+ - pytorch
4
+ - conda-forge
5
+ - defaults
6
+ dependencies:
7
+ - python=3.10
8
+ - pytorch
9
+ - torchvision
10
+ - torchaudio
11
+ - ffmpeg
12
+ - filelock
13
+ - freetype
14
+ - gmp
15
+ - gmpy2
16
+ - gnutls
17
+ - idna
18
+ - jinja2
19
+ - jpeg
20
+ - lame
21
+ - lcms2
22
+ - lerc
23
+ - libdeflate
24
+ - libffi
25
+ - libiconv
26
+ - libidn2
27
+ - libpng
28
+ - libtasn1
29
+ - libtiff
30
+ - libunistring
31
+ - libuuid
32
+ - libwebp
33
+ - llvm-openmp
34
+ - lz4-c
35
+ - markupsafe
36
+ - mpc
37
+ - mpfr
38
+ - mpmath
39
+ - ncurses
40
+ - nettle
41
+ - networkx
42
+ - openh264
43
+ - openjpeg
44
+ - openssl
45
+ - pillow
46
+ - pip
47
+ - pysocks
48
+ - pyyaml
49
+ - readline
50
+ - requests
51
+ - sqlite
52
+ - tbb
53
+ - tk
54
+ - typing_extensions
55
+ - urllib3
56
+ - wheel
57
+ - xz
58
+ - yaml
59
+ - zlib
60
+ - zstd
61
+ - pip:
62
+ - absl-py==2.2.2
63
+ - accelerate==1.6.0
64
+ - addict==2.4.0
65
+ - aiohappyeyeballs==2.6.1
66
+ - aiohttp==3.11.16
67
+ - aiosignal==1.3.2
68
+ - albumentations==0.5.2
69
+ - antlr4-python3-runtime==4.8
70
+ - asttokens==3.0.0
71
+ - async-timeout==5.0.1
72
+ - attrs==25.3.0
73
+ - av==14.3.0
74
+ - braceexpand==0.1.7
75
+ - cloudpickle==3.1.1
76
+ - colorama==0.4.6
77
+ - coloredlogs==15.0.1
78
+ - contourpy==1.3.2
79
+ - cycler==0.12.1
80
+ - cython==3.0.11
81
+ - decord==0.6.0
82
+ - diffdist==0.1
83
+ - diffusers==0.32.0
84
+ - easydict==1.9
85
+ - einops==0.4.1
86
+ - executing==2.2.0
87
+ - facexlib==0.3.0
88
+ - filterpy==1.4.5
89
+ - flatbuffers==25.2.10
90
+ - fonttools==4.57.0
91
+ - frozenlist==1.6.0
92
+ - fsspec==2025.3.2
93
+ - ftfy==6.1.1
94
+ - future==1.0.0
95
+ - gfpgan==1.3.8
96
+ - grpcio==1.71.0
97
+ - h5py==3.7.0
98
+ - huggingface-hub==0.30.2
99
+ - humanfriendly==10.0
100
+ - hydra-core==1.1.0
101
+ - icecream==2.1.2
102
+ - imageio==2.37.0
103
+ - imageio-ffmpeg==0.4.9
104
+ - imgaug==0.4.0
105
+ - importlib-metadata==8.6.1
106
+ - inflect==5.6.0
107
+ - joblib==1.4.2
108
+ - kiwisolver==1.4.8
109
+ - kornia==0.8.0
110
+ - kornia-rs==0.1.8
111
+ - lazy-loader==0.4
112
+ - lightning-utilities==0.14.3
113
+ - llvmlite==0.44.0
114
+ - lmdb==1.6.2
115
+ - loguru==0.7.3
116
+ - markdown==3.8
117
+ - markdown-it-py==3.0.0
118
+ - matplotlib==3.10.1
119
+ - mdurl==0.1.2
120
+ - multidict==6.4.3
121
+ - natten==0.14.4
122
+ - numba==0.61.2
123
+ - numpy==1.24.1
124
+ - omegaconf==2.1.2
125
+ - onnx==1.17.0
126
+ - onnxruntime
127
+ - open-clip-torch==2.30.0
128
+ - opencv-python==4.11.0.86
129
+ - opencv-python-headless==4.11.0.86
130
+ - packaging==24.2
131
+ - pandas==2.2.3
132
+ - peft==0.14.0
133
+ - platformdirs==4.3.7
134
+ - plyfile==1.1
135
+ - propcache==0.3.1
136
+ - protobuf==5.29.3
137
+ - psutil==7.0.0
138
+ - py-cpuinfo==9.0.0
139
+ - py360convert==1.0.3
140
+ - pygments==2.19.1
141
+ - pyparsing==3.2.3
142
+ - python-dateutil==2.9.0.post0
143
+ - pytorch-lightning==2.4.0
144
+ - pytz==2025.2
145
+ - qwen-vl-utils==0.0.8
146
+ - regex==2022.6.2
147
+ - rich==14.0.0
148
+ - safetensors==0.5.3
149
+ - scikit-image==0.24.0
150
+ - scikit-learn==1.6.1
151
+ - scipy==1.15.2
152
+ - seaborn==0.13.2
153
+ - segment-anything==1.0
154
+ - sentencepiece==0.2.0
155
+ - setuptools==59.5.0
156
+ - shapely==2.0.7
157
+ - six==1.17.0
158
+ - submitit==1.4.2
159
+ - sympy==1.13.1
160
+ - tabulate==0.9.0
161
+ - tb-nightly==2.20.0a20250421
162
+ - tensorboard-data-server==0.7.2
163
+ - termcolor==3.0.1
164
+ - threadpoolctl==3.6.0
165
+ - tifffile==2025.3.30
166
+ - timm==1.0.13
167
+ - tokenizers==0.21.1
168
+ - tomli==2.2.1
169
+ - torchmetrics==1.7.1
170
+ - tqdm==4.67.1
171
+ - transformers==4.51.0
172
+ - tzdata==2025.2
173
+ - ultralytics==8.3.74
174
+ - ultralytics-thop==2.0.14
175
+ - wcwidth==0.2.13
176
+ - webdataset==0.2.100
177
+ - werkzeug==3.1.3
178
+ - wldhx-yadisk-direct==0.0.6
179
+ - yapf==0.43.0
180
+ - yarl==1.20.0
181
+ - zipp==3.21.0
182
+ - open3d>=0.18.0
183
+ - trimesh>=4.6.1
184
+ - cmake
185
+ - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
186
+ - moge @ git+https://github.com/microsoft/MoGe.git
examples/case1/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case1/input.png ADDED

Git LFS Details

  • SHA256: bb701fd458a87beccdd821f557994db64ff8eba7f78f426cb350ed70bbf83f14
  • Pointer size: 132 Bytes
  • Size of remote file: 3.84 MB
examples/case2/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case2/input.png ADDED

Git LFS Details

  • SHA256: fd9115e2db03b1232b4727555a70445e96bafc7a84b2347950156da704c90edb
  • Pointer size: 132 Bytes
  • Size of remote file: 2.68 MB
examples/case2/labels_fg1.txt ADDED
@@ -0,0 +1 @@
1
+ stones
examples/case2/labels_fg2.txt ADDED
@@ -0,0 +1 @@
1
+ trees
examples/case3/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case3/input.png ADDED

Git LFS Details

  • SHA256: 55e19cd8173aef7f544dc2d487f0878ab6cffb8faf14121abe085bcd1ecbc888
  • Pointer size: 132 Bytes
  • Size of remote file: 3.32 MB
examples/case4/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case4/prompt.txt ADDED
@@ -0,0 +1 @@
1
+ There is a rocky island on the vast sea surface, with a triangular rock burning red flames in the center of the island. The sea is open and rough, with a green surface. Surrounded by towering peaks in the distance.
examples/case5/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case5/input.png ADDED

Git LFS Details

  • SHA256: 3fe08152fbb72b8845348564bd59fd43515d1291918a0f665cd2a4cca479344f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.95 MB
examples/case6/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case6/input.png ADDED

Git LFS Details

  • SHA256: a2ace3cd5c3b3b5a8d3e5d3bfc809bd20487895bffdf739389efc0c520b219f7
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
examples/case6/labels_fg1.txt ADDED
@@ -0,0 +1 @@
1
+ tent
examples/case7/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case7/prompt.txt ADDED
@@ -0,0 +1 @@
1
+ At the moment of glacier collapse, giant ice walls collapse and create waves, with no wildlife, captured in a disaster documentary
examples/case8/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case8/input.png ADDED

Git LFS Details

  • SHA256: ae6e9af5cf30d64bb0d0e19e6c7e4993126e8ca74191c4442948f13d2ceea755
  • Pointer size: 132 Bytes
  • Size of remote file: 2.03 MB
examples/case9/classes.txt ADDED
@@ -0,0 +1 @@
1
+ outdoor
examples/case9/prompt.txt ADDED
@@ -0,0 +1 @@
1
+ A breathtaking volcanic eruption scene. In the center of the screen, one or more volcanoes are erupting violently, with hot orange red lava gushing out from the crater, illuminating the surrounding night sky and landscape. Thick smoke and volcanic ash rose into the sky, forming a huge mushroom cloud like structure. Some of the smoke and dust were reflected in a dark red color by the high temperature of the lava, creating a doomsday atmosphere. In the foreground, a winding lava flow flows through the dark and rough rocks like a fire snake, emitting a dazzling light as if burning the earth. The steep and rugged mountains in the background further emphasize the ferocity and irresistible power of nature. The entire picture has a strong contrast of light and shadow, with red, black, and gray as the main colors, highlighting the visual impact and dramatic tension of volcanic eruptions, making people feel the grandeur and terror of nature.
hy3dworld/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ # Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
2
+ # THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
3
+ # IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ # By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
5
+ # any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
6
+ # You will be deemed to have recognized and accepted the content of this Agreement,
7
+ # which is effective immediately.
8
+
9
+ # For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
10
+ # and their software and algorithms, including trained model weights, parameters (including
11
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
12
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
13
+ # by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
14
+ # Image to Panorama
15
+ from .models import Image2PanoramaPipelines
16
+ from .utils import Perspective
17
+ # Text to Panorama
18
+ from .models import Text2PanoramaPipelines
19
+ # Scene Generation
20
+ from .models import LayerDecomposition
21
+ from .models import WorldComposer
22
+ from .utils import process_file
hy3dworld/models/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ # Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
2
+ # THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
3
+ # IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ # By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
5
+ # any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
6
+ # You will be deemed to have recognized and accepted the content of this Agreement,
7
+ # which is effective immediately.
8
+
9
+ # For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
10
+ # and their software and algorithms, including trained model weights, parameters (including
11
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
12
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
13
+ # by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
14
+
15
+ # Image to Panorama
16
+ from .pano_generator import Image2PanoramaPipelines
17
+ # Text to Panorama
18
+ from .pano_generator import Text2PanoramaPipelines
19
+
20
+ # Scene Generation
21
+ from .pipelines import FluxPipeline, FluxFillPipeline
22
+ from .layer_decomposer import LayerDecomposition
23
+ from .world_composer import WorldComposer
24
+
25
+ __all__ = [
26
+ "Image2PanoramaPipelines", "Text2PanoramaPipelines",
27
+ "FluxPipeline", "FluxFillPipeline",
28
+ "LayerDecomposition", "WorldComposer",
29
+ ]
hy3dworld/models/adaptive_depth_compression.py ADDED
@@ -0,0 +1,474 @@
1
+ # Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
2
+ # THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
3
+ # IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ # By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
5
+ # any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
6
+ # You will be deemed to have recognized and accepted the content of this Agreement,
7
+ # which is effective immediately.
8
+
9
+ # For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
10
+ # and their software and algorithms, including trained model weights, parameters (including
11
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
12
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
13
+ # by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
14
+ import torch
15
+ from typing import List, Dict, Tuple
16
+
17
+
18
+ class AdaptiveDepthCompressor:
19
+ r"""
20
+ Adaptive depth compressor to solve the problem of excessive background depth variance
21
+ in 3D world generation. This class provides methods to compress background and foreground
22
+ depth values based on statistical analysis of depth distributions, with options for
23
+ smooth compression and outlier removal.
24
+ Args:
25
+ cv_thresholds: Tuple of (low, high) thresholds for coefficient of variation (CV).
26
+ compression_quantiles: Tuple of (low, medium, high) quantiles for depth compression.
27
+ fg_bg_depth_margin: Margin factor to ensure foreground depth is greater than background.
28
+ enable_smooth_compression: Whether to use smooth compression instead of hard truncation.
29
+ outlier_removal_method: Method for outlier removal, options are "iqr", "quantile", or "none".
30
+ min_compression_depth: Minimum depth threshold for compression to be applied.
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ cv_thresholds: Tuple[float, float] = (0.3, 0.8),
36
+ compression_quantiles: Tuple[float, float, float] = (0.95, 0.92, 0.85),
37
+ fg_bg_depth_margin: float = 1.1,
38
+ enable_smooth_compression: bool = True,
39
+ outlier_removal_method: str = "iqr",
40
+ min_compression_depth: float = 6.0,
41
+ ):
42
+ self.cv_thresholds = cv_thresholds
43
+ self.compression_quantiles = compression_quantiles
44
+ self.fg_bg_depth_margin = fg_bg_depth_margin
45
+ self.enable_smooth_compression = enable_smooth_compression
46
+ self.outlier_removal_method = outlier_removal_method
47
+ self.min_compression_depth = min_compression_depth
48
+
49
+ def _remove_outliers(self, depth_vals: torch.Tensor) -> torch.Tensor:
50
+ r"""
51
+ Remove outliers from depth values
52
+ based on the specified method (IQR or quantile).
53
+ Args:
54
+ depth_vals: Tensor of depth values to process.
55
+ Returns:
56
+ Tensor of depth values with outliers removed.
57
+ """
58
+ if self.outlier_removal_method == "iqr":
59
+ q25, q75 = torch.quantile(depth_vals, torch.tensor(
60
+ [0.25, 0.75], device=depth_vals.device))
61
+ iqr = q75 - q25
62
+ lower_bound, upper_bound = q25 - 1.5 * iqr, q75 + 1.5 * iqr
63
+ valid_mask = (depth_vals >= lower_bound) & (
64
+ depth_vals <= upper_bound)
65
+ elif self.outlier_removal_method == "quantile":
66
+ q05, q95 = torch.quantile(depth_vals, torch.tensor(
67
+ [0.05, 0.95], device=depth_vals.device))
68
+ valid_mask = (depth_vals >= q05) & (depth_vals <= q95)
69
+ else:
70
+ return depth_vals
71
+ return depth_vals[valid_mask] if valid_mask.sum() > 0 else depth_vals
72
+
73
+ def _collect_foreground_depths(
74
+ self,
75
+ layered_world_depth: List[Dict]
76
+ ) -> List[torch.Tensor]:
77
+ r"""
78
+ Collect depth information of all foreground layers (remove outliers)
79
+ from the layered world depth representation.
80
+ Args:
81
+ layered_world_depth: List of dictionaries containing depth information for each layer.
82
+ Returns:
83
+ List of tensors containing cleaned foreground depth values.
84
+ """
85
+ fg_depths = []
86
+ for layer_depth in layered_world_depth:
87
+ if layer_depth["name"] == "background":
88
+ continue
89
+
90
+ depth_vals = layer_depth["distance"]
91
+ mask = layer_depth.get("mask", None)
92
+
93
+ # Process the depth values within the mask area
94
+ if mask is not None:
95
+ if not isinstance(mask, torch.Tensor):
96
+ mask = torch.from_numpy(mask).to(depth_vals.device)
97
+ depth_vals = depth_vals[mask.bool()]
98
+
99
+ if depth_vals.numel() > 0:
100
+ cleaned_depths = self._remove_outliers(depth_vals)
101
+ if len(cleaned_depths) > 0:
102
+ fg_depths.append(cleaned_depths)
103
+ return fg_depths
104
+
105
+ def _get_pixelwise_foreground_max_depth(
106
+ self,
107
+ layered_world_depth: List[Dict],
108
+ bg_shape: torch.Size,
109
+ bg_device: torch.device
110
+ ) -> torch.Tensor:
111
+ r"""
112
+ Calculate the maximum foreground depth for each pixel position
113
+ Args:
114
+ layered_world_depth: List of dictionaries containing depth information for each layer.
115
+ bg_shape: Shape of the background depth tensor.
116
+ bg_device: Device where the background depth tensor is located.
117
+ Returns:
118
+ Tensor of maximum foreground depth values for each pixel position.
119
+ """
120
+ fg_max_depth = torch.zeros(bg_shape, device=bg_device)
121
+
122
+ for layer_depth in layered_world_depth:
123
+ if layer_depth["name"] == "background":
124
+ continue
125
+
126
+ layer_distance = layer_depth["distance"]
127
+ layer_mask = layer_depth.get("mask", None)
128
+
129
+ # Ensure that the tensor is on the correct device
130
+ if not isinstance(layer_distance, torch.Tensor):
131
+ layer_distance = torch.from_numpy(layer_distance).to(bg_device)
132
+ else:
133
+ layer_distance = layer_distance.to(bg_device)
134
+
135
+ # Update the maximum depth of the foreground
136
+ if layer_mask is not None:
137
+ if not isinstance(layer_mask, torch.Tensor):
138
+ layer_mask = torch.from_numpy(layer_mask).to(bg_device)
139
+ else:
140
+ layer_mask = layer_mask.to(bg_device)
141
+ fg_max_depth = torch.where(layer_mask.bool(), torch.max(
142
+ fg_max_depth, layer_distance), fg_max_depth)
143
+ else:
144
+ fg_max_depth = torch.max(fg_max_depth, layer_distance)
145
+
146
+ return fg_max_depth
147
+
148
+ def _analyze_depth_distribution(self, bg_depth_distance: torch.Tensor) -> Dict:
149
+ r"""
150
+ Analyze the distribution characteristics of background depth
151
+ Args:
152
+ bg_depth_distance: Tensor of background depth distances.
153
+ Returns:
154
+ Dictionary containing statistical properties of the background depth distribution.
155
+ """
156
+ bg_mean, bg_std = torch.mean(
157
+ bg_depth_distance), torch.std(bg_depth_distance)
158
+ cv = bg_std / bg_mean
159
+
160
+ quantiles = torch.quantile(bg_depth_distance,
161
+ torch.tensor([0.5, 0.75, 0.9, 0.95, 0.99], device=bg_depth_distance.device))
162
+ bg_q50, bg_q75, bg_q90, bg_q95, bg_q99 = quantiles
163
+
164
+ return {"mean": bg_mean, "std": bg_std, "cv": cv, "q50": bg_q50,
165
+ "q75": bg_q75, "q90": bg_q90, "q95": bg_q95, "q99": bg_q99}
166
+
167
+ def _determine_compression_strategy(self, cv: float) -> Tuple[str, float]:
168
+ r"""
169
+ Determine compression strategy based on coefficient of variation
170
+ Args:
171
+ cv: Coefficient of variation of the background depth distribution.
172
+ Returns:
173
+ Tuple containing the compression strategy ("gentle", "standard", "aggressive")
174
+ and the quantile to use for compression.
175
+ """
176
+ low_cv_threshold, high_cv_threshold = self.cv_thresholds
177
+ low_var_quantile, medium_var_quantile, high_var_quantile = self.compression_quantiles
178
+
179
+ if cv < low_cv_threshold:
180
+ return "gentle", low_var_quantile
181
+ elif cv > high_cv_threshold:
182
+ return "aggressive", high_var_quantile
183
+ else:
184
+ return "standard", medium_var_quantile
185
+
186
+ def _smooth_compression(self, depth_values: torch.Tensor, max_depth: torch.Tensor,
187
+ mask: torch.Tensor = None, transition_start_ratio: float = 0.95,
188
+ transition_range_ratio: float = 0.2, verbose: bool = False) -> torch.Tensor:
189
+ r"""
190
+ Use smooth compression function instead of hard truncation
191
+ Args:
192
+ depth_values: Tensor of depth values to compress.
193
+ max_depth: Maximum depth value for compression.
194
+ mask: Optional mask to apply compression only to certain pixels.
195
+ transition_start_ratio: Ratio to determine the start of the transition range.
196
+ transition_range_ratio: Ratio to determine the range of the transition.
197
+ verbose: Whether to print detailed information about the compression process.
198
+ Returns:
199
+ Compressed depth values as a tensor.
200
+ """
201
+ if not self.enable_smooth_compression:
202
+ compressed = depth_values.clone()
203
+ if mask is not None:
204
+ compressed[mask] = torch.clamp(
205
+ depth_values[mask], max=max_depth)
206
+ else:
207
+ compressed = torch.clamp(depth_values, max=max_depth)
208
+ return compressed
209
+
210
+ transition_start = max_depth * transition_start_ratio
211
+ transition_range = max_depth * transition_range_ratio
212
+ compressed_depth = depth_values.clone()
213
+
214
+ mask_far = depth_values > transition_start
215
+ if mask is not None:
216
+ mask_far = mask_far & mask
217
+
218
+ if mask_far.sum() > 0:
219
+ far_depths = depth_values[mask_far]
220
+ normalized = (far_depths - transition_start) / transition_range
221
+ compressed_normalized = torch.sigmoid(
222
+ normalized * 2 - 1) * 0.5 + 0.5
223
+ compressed_far = transition_start + \
224
+ compressed_normalized * (max_depth - transition_start)
225
+ compressed_depth[mask_far] = compressed_far
226
+ if verbose:
227
+ print(
228
+ f"\t Applied smooth compression to {mask_far.sum()} pixels beyond {transition_start:.2f}")
229
+ elif verbose:
230
+ print(f"\t No compression needed, all depths within reasonable range")
231
+
232
+ return compressed_depth
233
+
234
+ def compress_background_depth(self, bg_depth_distance: torch.Tensor, layered_world_depth: List[Dict],
235
+ bg_mask: torch.Tensor, verbose: bool = False) -> torch.Tensor:
236
+ r"""
237
+ Adaptive compression of background depth values
238
+ Args:
239
+ bg_depth_distance: Tensor of background depth distances.
240
+ layered_world_depth: List of dictionaries containing depth information for each layer.
241
+ bg_mask: Tensor or numpy array representing the mask for background depth.
242
+ verbose: Whether to print detailed information about the compression process.
243
+ Returns:
244
+ Compressed background depth values as a tensor.
245
+ """
246
+ if verbose:
247
+ print(f"\t - Applying adaptive depth compression...")
248
+
249
+ # Process mask
250
+ if not isinstance(bg_mask, torch.Tensor):
251
+ bg_mask = torch.from_numpy(bg_mask).to(bg_depth_distance.device)
252
+ mask_bool = bg_mask.bool()
253
+ masked_depths = bg_depth_distance[mask_bool]
254
+
255
+ if masked_depths.numel() == 0:
256
+ if verbose:
257
+ print(f"\t No valid depths in mask region, skipping compression")
258
+ return bg_depth_distance
259
+
260
+ # 1. Collect prospect depth information
261
+ fg_depths = self._collect_foreground_depths(layered_world_depth)
262
+
263
+ # 2. Calculate prospect depth statistics
264
+ if fg_depths:
265
+ all_fg_depths = torch.cat(fg_depths)
266
+ fg_max = torch.quantile(all_fg_depths, torch.tensor(
267
+ 0.99, device=all_fg_depths.device))
268
+ if verbose:
269
+ print(
270
+ f"\t Foreground depth stats - 99th percentile: {fg_max:.2f}")
271
+ else:
272
+ fg_max = torch.quantile(masked_depths, torch.tensor(
273
+ 0.5, device=masked_depths.device))
274
+ if verbose:
275
+ print(f"\t No foreground found, using background stats for reference")
276
+
277
+ # 3. Analyze the depth distribution of the background
278
+ depth_stats = self._analyze_depth_distribution(masked_depths)
279
+ if verbose:
280
+ print(
281
+ f"\t Background depth stats - mean: {depth_stats['mean']:.2f}, \
282
+ std: {depth_stats['std']:.2f}, CV: {depth_stats['cv']:.3f}")
283
+
284
+ # 4. Determine compression strategy
285
+ strategy, compression_quantile = self._determine_compression_strategy(
286
+ depth_stats['cv'])
287
+ max_depth = torch.quantile(masked_depths, torch.tensor(
288
+ compression_quantile, device=masked_depths.device))
289
+
290
+ if verbose:
291
+ print(
292
+ f"\t {strategy.capitalize()} compression strategy \
293
+ (CV={depth_stats['cv']:.3f}), quantile={compression_quantile}")
294
+
295
+ # 5. Pixel-level depth constraint
296
+ if fg_depths:
297
+ fg_max_depth_pixelwise = self._get_pixelwise_foreground_max_depth(
298
+ layered_world_depth, bg_depth_distance.shape, bg_depth_distance.device)
299
+ required_min_bg_depth = fg_max_depth_pixelwise * self.fg_bg_depth_margin
300
+ pixelwise_violations = (
301
+ bg_depth_distance < required_min_bg_depth) & mask_bool
302
+
303
+ if pixelwise_violations.sum() > 0:
304
+ violation_ratio = pixelwise_violations.float().sum() / mask_bool.float().sum()
305
+ violated_required_depths = required_min_bg_depth[pixelwise_violations]
306
+ pixelwise_min_depth = torch.quantile(violated_required_depths, torch.tensor(
307
+ 0.99, device=violated_required_depths.device))
308
+ max_depth = torch.max(max_depth, pixelwise_min_depth)
309
+ if verbose:
310
+ print(
311
+ f"\t Pixelwise constraint violation: {violation_ratio:.1%}, \
312
+ adjusted max depth to {max_depth:.2f}")
313
+ elif verbose:
314
+ print(f"\t Pixelwise depth constraints satisfied")
315
+
316
+ # 6. Global statistical constraints
317
+ if fg_depths:
318
+ min_bg_depth = fg_max * self.fg_bg_depth_margin
319
+ max_depth = torch.max(max_depth, min_bg_depth)
320
+ if verbose:
321
+ print(f"\t Final max depth: {max_depth:.2f}")
322
+
323
+ # 6.5. Depth threshold check: If max_depth is less than the threshold, skip compression
324
+ if max_depth < self.min_compression_depth:
325
+ if verbose:
326
+ print(
327
+ f"\t Max depth {max_depth:.2f} is below threshold \
328
+ {self.min_compression_depth:.2f}, skipping compression")
329
+ return bg_depth_distance
330
+
331
+ # 7. Apply compression
332
+ compressed_depth = self._smooth_compression(
333
+ bg_depth_distance, max_depth, mask_bool, 0.9, 0.2, verbose)
334
+
335
+ # 8. Hard truncation of extreme outliers
336
+ final_max = max_depth * 1.2
337
+ outliers = (compressed_depth > final_max) & mask_bool
338
+ if outliers.sum() > 0:
339
+ compressed_depth[outliers] = final_max
340
+
341
+ # 9. Compression statistics
342
+ compression_ratio = ((bg_depth_distance > max_depth)
343
+ & mask_bool).float().sum() / mask_bool.float().sum()
344
+ if verbose:
345
+ print(
346
+ f"\t Compression summary - max depth: \
347
+ {max_depth:.2f}, affected: {compression_ratio:.1%}")
348
+
349
+ return compressed_depth
350
+
351
+ def compress_foreground_depth(
352
+ self,
353
+ fg_depth_distance: torch.Tensor,
354
+ fg_mask: torch.Tensor,
355
+ verbose: bool = False,
356
+ conservative_ratio: float = 0.99,
357
+ iqr_scale: float = 2
358
+ ) -> torch.Tensor:
359
+ r"""
360
+ Conservatively compress outliers for foreground depth
361
+ Args:
362
+ fg_depth_distance: Tensor of foreground depth distances.
363
+ fg_mask: Tensor or numpy array representing the mask for foreground depth.
364
+ verbose: Whether to print detailed information about the compression process.
365
+ conservative_ratio: Ratio to use for conservative compression.
366
+ iqr_scale: Scale factor for IQR-based upper bound.
367
+ Returns:
368
+ Compressed foreground depth values as a tensor.
369
+ """
370
+ if verbose:
371
+ print(f"\t - Applying conservative foreground depth compression...")
372
+
373
+ # Process mask
374
+ if not isinstance(fg_mask, torch.Tensor):
375
+ fg_mask = torch.from_numpy(fg_mask).to(fg_depth_distance.device)
376
+ mask_bool = fg_mask.bool()
377
+ masked_depths = fg_depth_distance[mask_bool]
378
+
379
+ if masked_depths.numel() == 0:
380
+ if verbose:
381
+ print(f"\t No valid depths in mask region, skipping compression")
382
+ return fg_depth_distance
383
+
384
+ # Calculate statistical information
385
+ depth_mean, depth_std = torch.mean(
386
+ masked_depths), torch.std(masked_depths)
387
+
388
+ # Determine the upper bound using IQR and quantile methods
389
+ q25, q75 = torch.quantile(masked_depths, torch.tensor(
390
+ [0.25, 0.75], device=masked_depths.device))
391
+ iqr = q75 - q25
392
+ upper_bound = q75 + iqr_scale * iqr
393
+ conservative_max = torch.quantile(masked_depths, torch.tensor(
394
+ conservative_ratio, device=masked_depths.device))
395
+ final_max = torch.max(upper_bound, conservative_max)
396
+
397
+ # Count statistical outliers
398
+ outliers = (fg_depth_distance > final_max) & mask_bool
399
+ outlier_count = outliers.sum().item()
400
+
401
+ if verbose:
402
+ print(
403
+ f"\t Depth stats - mean: {depth_mean:.2f}, std: {depth_std:.2f}")
404
+ print(
405
+ f"\t IQR bounds - Q25: {q25:.2f}, Q75: {q75:.2f}, upper: {upper_bound:.2f}")
406
+ print(
407
+ f"\t Conservative max: {conservative_max:.2f}, final max: {final_max:.2f}")
408
+ print(
409
+ f"\t Outliers: {outlier_count} ({(outlier_count/masked_depths.numel()*100):.2f}%)")
410
+
411
+ # Depth threshold check: If final_max is less than the threshold, skip compression
412
+ if final_max < self.min_compression_depth:
413
+ if verbose:
414
+ print(
415
+ f"\t Final max depth {final_max:.2f} is below threshold \
416
+ {self.min_compression_depth:.2f}, skipping compression")
417
+ return fg_depth_distance
418
+
419
+ # Apply compression
420
+ if outlier_count > 0:
421
+ compressed_depth = self._smooth_compression(
422
+ fg_depth_distance, final_max, mask_bool, 0.99, 0.1, verbose)
423
+ else:
424
+ compressed_depth = fg_depth_distance.clone()
425
+
426
+ return compressed_depth
427
+
428
+
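As a quick sanity check of the outlier bound used above (the maximum of Q75 + iqr_scale * IQR and the conservative quantile), here is a hedged numeric sketch with made-up values:

import torch

depths = torch.tensor([1.2, 1.4, 1.5, 1.6, 1.8, 2.0, 9.0])  # one obvious outlier
q25, q75 = torch.quantile(depths, torch.tensor([0.25, 0.75]))
iqr_bound = q75 + 2 * (q75 - q25)                # iqr_scale = 2 -> about 2.8
conservative_max = torch.quantile(depths, 0.99)  # about 8.6
final_max = torch.max(iqr_bound, conservative_max)
# The conservative 99th percentile dominates here, so only depths above ~8.6
# would be smoothly compressed; most of the foreground is left untouched.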
429
+ def create_adaptive_depth_compressor(
430
+ scene_type: str = "auto",
431
+ enable_smooth_compression: bool = True,
432
+ outlier_removal_method: str = "iqr",
433
+ min_compression_depth: float = 6.0, # Minimum compression depth threshold
434
+ ) -> AdaptiveDepthCompressor:
435
+ r"""
436
+ Create adaptive depth compressors suitable for different scene types
437
+ Args:
438
+ scene_type: Scene type ("indoor", "outdoor", "mixed", "auto")
439
+ enable_smooth_compression: Whether to enable smooth compression
440
+ outlier_removal_method: Outlier removal method ("iqr", "quantile", "none")
441
+ min_compression_depth: Minimum compression depth; below this threshold compression is skipped
+ """
442
+ common_params = {
443
+ "enable_smooth_compression": enable_smooth_compression,
444
+ "outlier_removal_method": outlier_removal_method,
445
+ "min_compression_depth": min_compression_depth,
446
+ }
447
+
448
+ if scene_type == "indoor":
449
+ # Indoor scenes: depth variation is relatively small, so use conservative compression
450
+ return AdaptiveDepthCompressor(
451
+ cv_thresholds=(0.2, 0.6),
452
+ compression_quantiles=(1.0, 0.975, 0.95),
453
+ fg_bg_depth_margin=1.05,
454
+ **common_params
455
+ )
456
+ elif scene_type == "outdoor":
457
+ # Outdoor scenes: sky, distant mountains, etc. may be present, so use more aggressive compression
458
+ return AdaptiveDepthCompressor(
459
+ cv_thresholds=(0.4, 1.0),
460
+ compression_quantiles=(0.98, 0.955, 0.93),
461
+ fg_bg_depth_margin=1.15,
462
+ **common_params
463
+ )
464
+ elif scene_type == "mixed":
465
+ # Mixed scenes: balanced settings
466
+ return AdaptiveDepthCompressor(
467
+ cv_thresholds=(0.3, 0.8),
468
+ compression_quantiles=(0.99, 0.97, 0.95),
469
+ fg_bg_depth_margin=1.1,
470
+ **common_params
471
+ )
472
+ else: # auto
473
+ # Automatic mode: Use default settings
474
+ return AdaptiveDepthCompressor(**common_params)
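Taken together, a typical call sequence for this module might look like the sketch below. The depth maps and masks are random placeholders, and layered_world_depth is left empty (its per-layer dictionary format is defined elsewhere in the pipeline), so treat this as an illustrative sketch rather than the project's actual driver code.

import torch
from hy3dworld.models.adaptive_depth_compression import create_adaptive_depth_compressor

compressor = create_adaptive_depth_compressor(scene_type="outdoor")

bg_depth = torch.rand(960, 1920) * 300.0                # placeholder panoramic depth
bg_mask = torch.ones_like(bg_depth, dtype=torch.bool)
# With no foreground layers given, the compressor falls back to background statistics.
compressed_bg = compressor.compress_background_depth(
    bg_depth, layered_world_depth=[], bg_mask=bg_mask, verbose=True)

fg_depth = torch.rand(960, 1920) * 30.0
fg_mask = torch.ones_like(fg_depth, dtype=torch.bool)
compressed_fg = compressor.compress_foreground_depth(fg_depth, fg_mask, verbose=True)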
hy3dworld/models/layer_decomposer.py ADDED
@@ -0,0 +1,155 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ from ..utils import sr_utils, seg_utils, inpaint_utils, layer_utils
5
+
6
+
7
+ class LayerDecomposition():
8
+ r"""LayerDecomposition is responsible for generating layers in a scene based on input images and masks.
9
+ It processes foreground objects, background layers, and sky regions using various models.
10
+ Args:
11
+ seed (int): Random seed for reproducibility.
12
+ strength (float): Strength of the layer generation.
13
+ threshold (int): Threshold for object detection.
14
+ ratio (float): Ratio for scaling objects.
15
+ grounding_model (str): Path to the grounding model for object detection.
16
+ zim_model_config (str): Configuration for the ZIM model.
17
+ zim_checkpoint (str): Path to the ZIM model checkpoint.
18
+ inpaint_model (str): Path to the inpainting model.
19
+ inpaint_fg_lora (str): Path to the LoRA weights for foreground inpainting.
20
+ inpaint_sky_lora (str): Path to the LoRA weights for sky inpainting.
21
+ scale (int): Scale factor for super-resolution.
22
+ device (str): Device to run the model on, either "cuda" or "cpu".
23
+ dilation_size (int): Size of the dilation for mask processing.
24
+ cfg_scale (float): Configuration scale for the model.
25
+ prompt_config (dict): Configuration for prompts used in the model.
26
+ """
27
+ def __init__(self):
28
+ r"""Initialize the LayerDecomposition class with model paths and parameters."""
29
+ self.seed = 25
30
+ self.strength = 1.0
31
+ self.threshold = 20000
32
+ self.ratio = 1.5
33
+ self.grounding_model = "IDEA-Research/grounding-dino-tiny"
34
+ self.zim_model_config = "vit_l"
35
+ self.zim_checkpoint = "./ZIM/zim_vit_l_2092" # Add zim anything ckpt here
36
+ self.inpaint_model = "black-forest-labs/FLUX.1-Fill-dev"
37
+ self.inpaint_fg_lora = "tencent/HunyuanWorld-1"
38
+ self.inpaint_sky_lora = "tencent/HunyuanWorld-1"
39
+ self.scale = 2
40
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
41
+ self.dilation_size = 80
42
+ self.cfg_scale = 5.0
43
+ self.prompt_config = {
44
+ "indoor": {
45
+ "positive_prompt": "",
46
+ "negative_prompt": (
47
+ "object, table, chair, seat, shelf, sofa, bed, bath, sink, "
48
+ "ceramic, wood, plant, tree, light, lamp, candle, television, electronics, "
49
+ "oven, fire, low-resolution, blur, mosaic, people")
50
+ },
51
+ "outdoor": {
52
+ "positive_prompt": "",
53
+ "negative_prompt": (
54
+ "object, chair, tree, plant, flower, grass, stone, rock, "
55
+ "building, hill, house, tower, light, lamp, low-resolution, blur, mosaic, people")
56
+ }
57
+ }
58
+
59
+ # Load models
60
+ print("============= now loading models ===============")
61
+ # super-resolution model
62
+ self.sr_model = sr_utils.build_sr_model(scale=self.scale, gpu_id=0)
63
+ print("============= load Super-Resolution models done ")
64
+ # segmentation model
65
+ self.zim_predictor = seg_utils.build_zim_model(
66
+ self.zim_model_config, self.zim_checkpoint, device='cuda:0')
67
+ self.gd_processor, self.gd_model = seg_utils.build_gd_model(
68
+ self.grounding_model, device='cuda:0')
69
+ print("============= load Segmentation models done ====")
70
+ # panorama inpaint model
71
+ self.inpaint_fg_model = inpaint_utils.build_inpaint_model(
72
+ self.inpaint_model,
73
+ self.inpaint_fg_lora,
74
+ subfolder="HunyuanWorld-PanoInpaint-Scene",
75
+ device=0
76
+ )
77
+ self.inpaint_sky_model = inpaint_utils.build_inpaint_model(
78
+ self.inpaint_model,
79
+ self.inpaint_sky_lora,
80
+ subfolder="HunyuanWorld-PanoInpaint-Sky",
81
+ device=0
82
+ )
83
+ print("============= load panorama inpaint models done =")
84
+
85
+ def __call__(self, input, layer):
86
+ r"""Generate layers based on the input images and masks.
87
+ Args:
88
+ input (str or list): Path to the input JSON file or a list of image information.
89
+ layer (int): Layer index to process (0 for foreground1, 1 for foreground2,
90
+ 2 for sky).
91
+ Raises:
92
+ FileNotFoundError: If the input file does not exist.
93
+ ValueError: If the input file is not a JSON file or if the layer index is invalid.
94
+ TypeError: If the input is neither a string nor a list.
95
+ """
96
+ torch.autocast(device_type=self.device,
97
+ dtype=torch.bfloat16).__enter__()
98
+
99
+ # Input handling and validation
100
+ if isinstance(input, str):
101
+ if not os.path.exists(input):
102
+ raise FileNotFoundError(f"Input file {input} does not exist.")
103
+ if not input.endswith('.json'):
104
+ raise ValueError("Input file must be a JSON file.")
105
+ with open(input, "r") as f:
106
+ img_infos = json.load(f)
107
+ img_infos = img_infos["output"]
108
+ elif isinstance(input, list):
109
+ img_infos = input
110
+ else:
111
+ raise TypeError("Input must be a JSON file path or a list.")
112
+
113
+ # Processing parameters
114
+ params = {
115
+ 'scale': self.scale,
116
+ 'seed': self.seed,
117
+ 'threshold': self.threshold,
118
+ 'ratio': self.ratio,
119
+ 'strength': self.strength,
120
+ 'dilation_size': self.dilation_size,
121
+ 'cfg_scale': self.cfg_scale,
122
+ 'prompt_config': self.prompt_config
123
+ }
124
+
125
+ # Layer-specific processing pipelines
126
+ if layer == 0:
127
+ layer_utils.remove_fg1_pipeline(
128
+ img_infos=img_infos,
129
+ sr_model=self.sr_model,
130
+ zim_predictor=self.zim_predictor,
131
+ gd_processor=self.gd_processor,
132
+ gd_model=self.gd_model,
133
+ inpaint_model=self.inpaint_fg_model,
134
+ params=params
135
+ )
136
+ elif layer == 1:
137
+ layer_utils.remove_fg2_pipeline(
138
+ img_infos=img_infos,
139
+ sr_model=self.sr_model,
140
+ zim_predictor=self.zim_predictor,
141
+ gd_processor=self.gd_processor,
142
+ gd_model=self.gd_model,
143
+ inpaint_model=self.inpaint_fg_model,
144
+ params=params
145
+ )
146
+ else:
147
+ layer_utils.sky_pipeline(
148
+ img_infos=img_infos,
149
+ sr_model=self.sr_model,
150
+ zim_predictor=self.zim_predictor,
151
+ gd_processor=self.gd_processor,
152
+ gd_model=self.gd_model,
153
+ inpaint_model=self.inpaint_sky_model,
154
+ params=params
155
+ )
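A minimal driver for this class might look like the following sketch; the JSON path is a placeholder, and the layer indices follow the __call__ docstring above (0 and 1 for the two foreground passes, 2 for the sky pass).

from hy3dworld.models.layer_decomposer import LayerDecomposition

decomposer = LayerDecomposition()  # loads the SR, segmentation and inpainting models

# The input JSON is expected to contain an "output" list with per-image info.
scene_json = "outputs/case1/pano_info.json"  # placeholder path

decomposer(scene_json, layer=0)  # decompose / inpaint foreground layer 1
decomposer(scene_json, layer=1)  # decompose / inpaint foreground layer 2
decomposer(scene_json, layer=2)  # complete the sky layer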
hy3dworld/models/pano_generator.py ADDED
@@ -0,0 +1,236 @@
1
+ # Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
2
+ # THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
3
+ # IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ # By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
5
+ # any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
6
+ # You will be deemed to have recognized and accepted the content of this Agreement,
7
+ # which is effective immediately.
8
+
9
+ # For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
10
+ # and their software and algorithms, including trained model weights, parameters (including
11
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
12
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
13
+ # by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
14
+
15
+ import torch
16
+ from transformers import (
17
+ CLIPTextModel,
18
+ CLIPTokenizer,
19
+ T5EncoderModel,
20
+ T5TokenizerFast,
21
+ )
22
+
23
+ from diffusers.image_processor import VaeImageProcessor
24
+ from diffusers.models.autoencoders import AutoencoderKL
25
+
26
+ from diffusers.models.transformers import FluxTransformer2DModel
27
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
28
+
29
+ from diffusers.utils.torch_utils import randn_tensor
30
+
31
+ from .pipelines import FluxPipeline, FluxFillPipeline
32
+
33
+ class Text2PanoramaPipelines(FluxPipeline):
34
+ @torch.no_grad()
35
+ def __call__(self, prompt, **kwargs):
36
+ """Main text-to-panorama generation call."""
37
+ return self._call_shared(prompt=prompt, is_inpainting=False, early_steps=3, **kwargs)
38
+
39
+
40
+ class Image2PanoramaPipelines(FluxFillPipeline):
41
+ def __init__(
42
+ self,
43
+ scheduler: FlowMatchEulerDiscreteScheduler,
44
+ vae: AutoencoderKL,
45
+ text_encoder: CLIPTextModel,
46
+ tokenizer: CLIPTokenizer,
47
+ text_encoder_2: T5EncoderModel,
48
+ tokenizer_2: T5TokenizerFast,
49
+ transformer: FluxTransformer2DModel,
50
+ ):
51
+ # Initialization from FluxFillPipeline
52
+ super().__init__(
53
+ scheduler=scheduler,
54
+ vae=vae,
55
+ text_encoder=text_encoder,
56
+ tokenizer=tokenizer,
57
+ text_encoder_2=text_encoder_2,
58
+ tokenizer_2=tokenizer_2,
59
+ transformer=transformer,
60
+ )
61
+
62
+ # Change some parts of the initialization
63
+ self.latent_channels = self.vae.config.latent_channels if getattr(
64
+ self, "vae", None) else 16
65
+
66
+ self.mask_processor = VaeImageProcessor(
67
+ vae_scale_factor=self.vae_scale_factor * 2,
68
+ vae_latent_channels=self.latent_channels,
69
+ do_normalize=False,
70
+ do_binarize=True,
71
+ do_convert_grayscale=True,
72
+ )
73
+
74
+ def get_timesteps(self, num_inference_steps, strength, device):
75
+ # get the original timestep using init_timestep
76
+ init_timestep = min(num_inference_steps *
77
+ strength, num_inference_steps)
78
+
79
+ t_start = int(max(num_inference_steps - init_timestep, 0))
80
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
81
+ if hasattr(self.scheduler, "set_begin_index"):
82
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
83
+
84
+ return timesteps, num_inference_steps - t_start
85
+
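For intuition, the strength handling above reduces to two simple operations; a quick numeric illustration with arbitrary example values:

num_inference_steps, strength = 30, 0.8
init_timestep = min(num_inference_steps * strength, num_inference_steps)  # 24.0
t_start = int(max(num_inference_steps - init_timestep, 0))                # 6
# The scheduler then skips the first 6 of 30 timesteps and denoises for the
# remaining 24, so higher strength starts from noisier latents and edits more.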
86
+ def prepare_inpainting_latents(
87
+ self,
88
+ batch_size,
89
+ num_channels_latents,
90
+ height,
91
+ width,
92
+ dtype,
93
+ device,
94
+ generator,
95
+ latents=None,
96
+ image=None,
97
+ is_strength_max=True,
98
+ timestep=None,
99
+ ):
100
+ r"""
101
+ Prepares the latents for the Image2PanoramaPipelines.
102
+ """
103
+ if isinstance(generator, list) and len(generator) != batch_size:
104
+ raise ValueError(
105
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
106
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
107
+ )
108
+
109
+ # VAE applies 8x compression on images but we must also account for packing which requires
110
+ # latent height and width to be divisible by 2.
111
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
112
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
113
+ shape = (batch_size, num_channels_latents, height, width)
114
+
115
+ # Return the latents if they are already provided
116
+ if latents is not None:
117
+ latent_image_ids = self._prepare_latent_image_ids(
+ batch_size, height // 2, width // 2, device, dtype)
+ return latents.to(device=device, dtype=dtype), latent_image_ids
118
+
119
+ # If no latents are provided, we need to encode the image
120
+ image = image.to(device=device, dtype=dtype)
121
+ if image.shape[1] != self.latent_channels:
122
+ image_latents = self._encode_vae_image(
123
+ image=image, generator=generator)
124
+ else:
125
+ image_latents = image
126
+
127
+ # Ensure image_latents has the correct shape
128
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
129
+ # expand init_latents for batch_size
130
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
131
+ image_latents = torch.cat(
132
+ [image_latents] * additional_image_per_prompt, dim=0)
133
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
134
+ raise ValueError(
135
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
136
+ )
137
+ else:
138
+ image_latents = torch.cat([image_latents], dim=0)
139
+ # Add noise to the latents
140
+ noise = randn_tensor(shape, generator=generator,
141
+ device=device, dtype=dtype)
142
+ latents = self.scheduler.scale_noise(image_latents, timestep, noise)
143
+
144
+ # prepare blended latents
145
+ latents = torch.cat(
146
+ [latents, latents[:, :, :, :self.blend_extend]], dim=-1)
147
+ width_new_blended = latents.shape[-1]
148
+ latents = self._pack_latents(
149
+ latents, batch_size, num_channels_latents, height, width_new_blended)
150
+ # prepare latent image ids
151
+ latent_image_ids = self._prepare_latent_image_ids(
152
+ batch_size, height // 2, width_new_blended // 2, device, dtype)
153
+
154
+ return latents, latent_image_ids, width_new_blended
155
+
156
+ def prepare_blending_latent(
157
+ self, latents, height, width, batch_size, num_channels_latents, width_new_blended=None
158
+ ):
159
+ return latents, width_new_blended
160
+
161
+ def _apply_blending(
162
+ self,
163
+ latents: torch.Tensor,
164
+ height: int,
165
+ width_new_blended: int,
166
+ num_channels_latents: int,
167
+ batch_size: int,
168
+ **kwargs,
169
+ ) -> torch.Tensor:
170
+ r"""Apply horizontal blending to latents."""
171
+ # Unpack latents for processing
172
+ latents_unpack = self._unpack_latents(
173
+ latents, height, width_new_blended*self.vae_scale_factor, self.vae_scale_factor
174
+ )
175
+ # Apply blending
176
+ latents_unpack = self.blend_h(latents_unpack, latents_unpack, self.blend_extend)
177
+
178
+ latent_height = 2 * \
179
+ (int(height) // (self.vae_scale_factor * 2))
180
+
181
+ shifting_extend = kwargs.get("shifting_extend", None)
182
+ if shifting_extend is None:
183
+ shifting_extend = latents_unpack.size()[-1]//4
184
+
185
+ latents_unpack = torch.roll(
186
+ latents_unpack, shifting_extend, -1)
187
+
188
+ # Repack latents after blending
189
+ latents = self._pack_latents(
190
+ latents_unpack, batch_size, num_channels_latents, latent_height, width_new_blended)
191
+ return latents
192
+
193
+ def _apply_blending_mask(
194
+ self,
195
+ latents: torch.Tensor,
196
+ height: int,
197
+ width_new_blended: int,
198
+ num_channels_latents: int,
199
+ batch_size: int,
200
+ **kwargs
201
+ ) -> torch.Tensor:
202
+ r"""Apply horizontal blending to mask latents."""
203
+ return self._apply_blending(
204
+ latents, height, width_new_blended, 80, batch_size, **kwargs
205
+ )
206
+
207
+ def _final_process_latents(
208
+ self,
209
+ latents: torch.Tensor,
210
+ height: int,
211
+ width_new_blended: int,
212
+ width: int
213
+ ) -> torch.Tensor:
214
+ """Final processing of latents before decoding."""
215
+ # Unpack and crop to target width
216
+ latents_unpack = self._unpack_latents(
217
+ latents, height, width_new_blended * self.vae_scale_factor, self.vae_scale_factor
218
+ )
219
+ latents_unpack = self.blend_h(
220
+ latents_unpack, latents_unpack, self.blend_extend
221
+ )
222
+ latents_unpack = latents_unpack[:, :, :, :width // self.vae_scale_factor]
223
+
224
+ # Repack for final output
225
+ return self._pack_latents(
226
+ latents_unpack,
227
+ latents.shape[0], # batch size
228
+ latents.shape[2] // 4, # num_channels_latents
229
+ height // self.vae_scale_factor,
230
+ width // self.vae_scale_factor
231
+ )
232
+
233
+ @torch.no_grad()
234
+ def __call__(self, **kwargs):
235
+ """Main inpainting call."""
236
+ return self._call_shared(is_inpainting=True, early_steps=3, blend_extra_chanel=True, **kwargs)
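The blending helpers above extend the latent canvas horizontally, cross-fade the duplicated columns, and roll the result so the left and right edges of the equirectangular panorama meet without a seam. The standalone sketch below illustrates that wrap-around idea on a plain tensor with a simple linear cross-fade; it is a conceptual illustration under those assumptions, not the pipeline's blend_h implementation.

import torch

def wraparound_blend(x: torch.Tensor, extend: int) -> torch.Tensor:
    # x: (..., W) feature map. Append the first `extend` columns, cross-fade
    # them into the last `extend` columns, then crop back to width W.
    w = x.shape[-1]
    extended = torch.cat([x, x[..., :extend]], dim=-1)
    weight = torch.linspace(0, 1, extend, device=x.device)  # 0 -> 1 ramp
    left = extended[..., w:w + extend]       # duplicated leading columns
    right = extended[..., w - extend:w]      # original trailing columns
    extended[..., w - extend:w] = right * (1 - weight) + left * weight
    return extended[..., :w]

x = torch.arange(16, dtype=torch.float32).repeat(1, 1, 1, 1)  # (1, 1, 1, 16) toy "latent"
print(wraparound_blend(x, extend=4))
# The last four columns now ramp back toward the left-edge values, so tiling
# the result horizontally (as a 360-degree panorama does) shows no hard seam.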