Initial commit of Gradio app
(This view is limited to 50 files because the commit contains too many changes.)
- .gitattributes +14 -0
- .gitignore +3 -0
- LICENSE +81 -0
- Miniconda3-latest-MacOSX-arm64.sh +3 -0
- README.md +232 -12
- README_zh_cn.md +224 -0
- app.py +245 -0
- assets/application.png +3 -0
- assets/arch.jpg +3 -0
- assets/panorama1.gif +3 -0
- assets/panorama2.gif +3 -0
- assets/qrcode/discord.png +0 -0
- assets/qrcode/wechat.png +0 -0
- assets/qrcode/x.png +0 -0
- assets/qrcode/xiaohongshu.png +0 -0
- assets/quick_look.gif +3 -0
- assets/roaming_world.gif +3 -0
- assets/teaser.png +3 -0
- demo_panogen.py +223 -0
- demo_scenegen.py +120 -0
- docker/HunyuanWorld.osx-cpu.yaml +142 -0
- docker/HunyuanWorld.osx64.yaml +247 -0
- docker/HunyuanWorld.yaml +246 -0
- docker/HunyuanWorld_mac.yaml +186 -0
- examples/case1/classes.txt +1 -0
- examples/case1/input.png +3 -0
- examples/case2/classes.txt +1 -0
- examples/case2/input.png +3 -0
- examples/case2/labels_fg1.txt +1 -0
- examples/case2/labels_fg2.txt +1 -0
- examples/case3/classes.txt +1 -0
- examples/case3/input.png +3 -0
- examples/case4/classes.txt +1 -0
- examples/case4/prompt.txt +1 -0
- examples/case5/classes.txt +1 -0
- examples/case5/input.png +3 -0
- examples/case6/classes.txt +1 -0
- examples/case6/input.png +3 -0
- examples/case6/labels_fg1.txt +1 -0
- examples/case7/classes.txt +1 -0
- examples/case7/prompt.txt +1 -0
- examples/case8/classes.txt +1 -0
- examples/case8/input.png +3 -0
- examples/case9/classes.txt +1 -0
- examples/case9/prompt.txt +1 -0
- hy3dworld/__init__.py +22 -0
- hy3dworld/models/__init__.py +29 -0
- hy3dworld/models/adaptive_depth_compression.py +474 -0
- hy3dworld/models/layer_decomposer.py +155 -0
- hy3dworld/models/pano_generator.py +236 -0
.gitattributes
CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Miniconda3-latest-MacOSX-arm64.sh filter=lfs diff=lfs merge=lfs -text
+assets/application.png filter=lfs diff=lfs merge=lfs -text
+assets/arch.jpg filter=lfs diff=lfs merge=lfs -text
+assets/panorama1.gif filter=lfs diff=lfs merge=lfs -text
+assets/panorama2.gif filter=lfs diff=lfs merge=lfs -text
+assets/quick_look.gif filter=lfs diff=lfs merge=lfs -text
+assets/roaming_world.gif filter=lfs diff=lfs merge=lfs -text
+assets/teaser.png filter=lfs diff=lfs merge=lfs -text
+examples/case1/input.png filter=lfs diff=lfs merge=lfs -text
+examples/case2/input.png filter=lfs diff=lfs merge=lfs -text
+examples/case3/input.png filter=lfs diff=lfs merge=lfs -text
+examples/case5/input.png filter=lfs diff=lfs merge=lfs -text
+examples/case6/input.png filter=lfs diff=lfs merge=lfs -text
+examples/case8/input.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
# Ignore Python bytecode files
__pycache__/
*.pyc
LICENSE
ADDED
@@ -0,0 +1,81 @@
TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
Tencent HunyuanWorld-1.0 Release Date: July 27, 2025
THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
1. DEFINITIONS.
a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent HunyuanWorld-1.0 Works or any portion or element thereof set forth herein.
c. “Documentation” shall mean the specifications, manuals and documentation for Tencent HunyuanWorld-1.0 made publicly available by Tencent.
d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent HunyuanWorld-1.0 Works for any purpose and in any field of use.
f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent HunyuanWorld-1.0 and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
g. “Model Derivatives” shall mean all: (i) modifications to Tencent HunyuanWorld-1.0 or any Model Derivative of Tencent HunyuanWorld-1.0; (ii) works based on Tencent HunyuanWorld-1.0 or any Model Derivative of Tencent HunyuanWorld-1.0; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent HunyuanWorld-1.0 or any Model Derivative of Tencent HunyuanWorld-1.0, to that model in order to cause that model to perform similarly to Tencent HunyuanWorld-1.0 or a Model Derivative of Tencent HunyuanWorld-1.0, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent HunyuanWorld-1.0 or a Model Derivative of Tencent HunyuanWorld-1.0 for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
h. “Output” shall mean the information and/or content output of Tencent HunyuanWorld-1.0 or a Model Derivative that results from operating or otherwise using Tencent HunyuanWorld-1.0 or a Model Derivative, including via a Hosted Service.
i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
j. “Tencent HunyuanWorld-1.0” shall mean the 3D generation models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
k. “Tencent HunyuanWorld-1.0 Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
n. “including” shall mean including but not limited to.
2. GRANT OF RIGHTS.
We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
3. DISTRIBUTION.
You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent HunyuanWorld-1.0 Works, exclusively in the Territory, provided that You meet all of the following conditions:
a. You must provide all such Third Party recipients of the Tencent HunyuanWorld-1.0 Works or products or services using them a copy of this Agreement;
b. You must cause any modified files to carry prominent notices stating that You changed the files;
c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent HunyuanWorld-1.0 Works; and (ii) mark the products or services developed by using the Tencent HunyuanWorld-1.0 Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent HunyuanWorld-1.0 is licensed under the Tencent HunyuanWorld-1.0 Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent HunyuanWorld-1.0 Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
4. ADDITIONAL COMMERCIAL TERMS.
If, on the Tencent HunyuanWorld-1.0 version release date, the monthly active users of all products or services made available by or for Licensee is greater than 1 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
Subject to Tencent's written approval, you may request a license for the use of Tencent HunyuanWorld-1.0 by submitting the following information to [email protected]:
a. Your company’s name and associated business sector that plans to use Tencent HunyuanWorld-1.0.
b. Your intended use case and the purpose of using Tencent HunyuanWorld-1.0.
c. Your plans to modify Tencent HunyuanWorld-1.0 or create Model Derivatives.
5. RULES OF USE.
a. Your use of the Tencent HunyuanWorld-1.0 Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent HunyuanWorld-1.0 Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent HunyuanWorld-1.0 Works and You must provide notice to subsequent users to whom You distribute that Tencent HunyuanWorld-1.0 Works are subject to the use restrictions in these Sections 5(a) and 5(b).
b. You must not use the Tencent HunyuanWorld-1.0 Works or any Output or results of the Tencent HunyuanWorld-1.0 Works to improve any other AI model (other than Tencent HunyuanWorld-1.0 or Model Derivatives thereof).
c. You must not use, reproduce, modify, distribute, or display the Tencent HunyuanWorld-1.0 Works, Output or results of the Tencent HunyuanWorld-1.0 Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
6. INTELLECTUAL PROPERTY.
a. Subject to Tencent’s ownership of Tencent HunyuanWorld-1.0 Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
b. No trademark licenses are granted under this Agreement, and in connection with the Tencent HunyuanWorld-1.0 Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent HunyuanWorld-1.0 Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent HunyuanWorld-1.0 Works.
d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent HunyuanWorld-1.0 Works or to grant any license thereto.
b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUANWORLD-1.0 WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUANWORLD-1.0 WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUANWORLD-1.0 WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUANWORLD-1.0 WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
8. SURVIVAL AND TERMINATION.
a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent HunyuanWorld-1.0 Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
9. GOVERNING LAW AND JURISDICTION.
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.

EXHIBIT A
ACCEPTABLE USE POLICY

Tencent reserves the right to update this Acceptable Use Policy from time to time.
Last modified: November 5, 2024

Tencent endeavors to promote safe and fair use of its tools and features, including Tencent HunyuanWorld-1.0. You agree not to use Tencent HunyuanWorld-1.0 or Model Derivatives:
1. Outside the Territory;
2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
3. To harm Yourself or others;
4. To repurpose or distribute output from Tencent HunyuanWorld-1.0 or any Model Derivatives to harm Yourself or others;
5. To override or circumvent the safety guardrails and safeguards We have put in place;
6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
9. To intentionally defame, disparage or otherwise harass others;
10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
11. To generate or disseminate personal identifiable information with the purpose of harming others;
12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including through the use of bot-generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
13. To impersonate another individual without consent, authorization, or legal right;
14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
19. For military purposes;
20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
Miniconda3-latest-MacOSX-arm64.sh
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2ec6f7981770b3396a9ab426e07ac8ef5b12b4393aa2e4bcc984376fe3aa327e
size 114835350
README.md
CHANGED
@@ -1,12 +1,232 @@
[中文阅读](README_zh_cn.md)

<p align="center">
  <img src="assets/teaser.png">
</p>

<div align="center">
  <a href=https://3d.hunyuan.tencent.com/sceneTo3D target="_blank"><img src=https://img.shields.io/badge/Official%20Site-333399.svg?logo=homepage height=22px></a>
  <a href=https://huggingface.co/tencent/HunyuanWorld-1 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
  <a href=https://3d-models.hunyuan.tencent.com/world/ target="_blank"><img src=https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
  <a href=https://discord.gg/dNBrdrGGMa target="_blank"><img src=https://img.shields.io/badge/Discord-white.svg?logo=discord height=22px></a>
  <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
  <a href="#community-resources" target="_blank"><img src=https://img.shields.io/badge/Community-lavender.svg?logo=homeassistantcommunitystore height=22px></a>
</div>

[//]: # ( <a href=# target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>)

[//]: # ( <a href=# target="_blank"><img src= https://img.shields.io/badge/Colab-8f2628.svg?logo=googlecolab height=22px></a>)

[//]: # ( <a href="#"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/v/mulankit?logo=pypi" height=22px></a>)

<br>

<p align="center">
  "To see a World in a Grain of Sand, and a Heaven in a Wild Flower"
</p>

https://github.com/user-attachments/assets/513c9529-2b34-4872-b38f-4f291f3ae1c7

## 🔥 News
- July 26, 2025: 👋 We present the technical report of HunyuanWorld-1.0; please check out the details and spark some discussion!
- July 26, 2025: 🤗 We release the first open-source, simulation-capable, immersive 3D world generation model, HunyuanWorld-1.0!

> Join our **[Wechat](#)** and **[Discord](https://discord.gg/dNBrdrGGMa)** groups to discuss and get help from us.

| Wechat Group | Xiaohongshu | X | Discord |
|--------------------------------------------------|-------------------------------------------------------|---------------------------------------------|---------------------------------------------------|
| <img src="assets/qrcode/wechat.png" height=140> | <img src="assets/qrcode/xiaohongshu.png" height=140> | <img src="assets/qrcode/x.png" height=140> | <img src="assets/qrcode/discord.png" height=140> |

## ☯️ **HunyuanWorld 1.0**

### Abstract
Creating immersive and playable 3D worlds from texts or images remains a fundamental challenge in computer vision and graphics. Existing world generation approaches typically fall into two categories: video-based methods that offer rich diversity but lack 3D consistency and rendering efficiency, and 3D-based methods that provide geometric consistency but struggle with limited training data and memory-inefficient representations. To address these limitations, we present HunyuanWorld 1.0, a novel framework that combines the best of both sides for generating immersive, explorable, and interactive 3D worlds from text and image conditions. Our approach features three key advantages: 1) 360° immersive experiences via panoramic world proxies; 2) mesh export capabilities for seamless compatibility with existing computer graphics pipelines; 3) disentangled object representations for augmented interactivity. The core of our framework is a semantically layered 3D mesh representation that leverages panoramic images as 360° world proxies for semantic-aware world decomposition and reconstruction, enabling the generation of diverse 3D worlds. Extensive experiments demonstrate that our method achieves state-of-the-art performance in generating coherent, explorable, and interactive 3D worlds while enabling versatile applications in virtual reality, physical simulation, game development, and interactive content creation.

<p align="center">
  <img src="assets/application.png">
</p>

### Architecture
Tencent HunyuanWorld-1.0's generation architecture integrates panoramic proxy generation, semantic layering, and hierarchical 3D reconstruction to achieve high-quality, scene-scale 360° 3D world generation, supporting both text and image inputs.

<p align="left">
  <img src="assets/arch.jpg">
</p>

### Performance

We evaluated HunyuanWorld 1.0 against other open-source panorama generation and 3D world generation methods. The numerical results indicate that HunyuanWorld 1.0 surpasses the baselines in visual quality and geometric consistency.

<p align="center">
Text-to-panorama generation
</p>

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| Diffusion360     | 69.5                  | 7.5                | 1.8                 | 20.9               |
| MVDiffusion      | 47.9                  | 7.1                | 2.4                 | 21.5               |
| PanFusion        | 56.6                  | 7.6                | 2.2                 | 21.0               |
| LayerPano3D      | 49.6                  | 6.5                | 3.7                 | 21.5               |
| HunyuanWorld 1.0 | 40.8                  | 5.8                | 4.4                 | 24.3               |

<p align="center">
Image-to-panorama generation
</p>

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| Diffusion360     | 71.4                  | 7.8                | 1.9                 | 73.9               |
| MVDiffusion      | 47.7                  | 7.0                | 2.7                 | 80.8               |
| HunyuanWorld 1.0 | 45.2                  | 5.8                | 4.3                 | 85.1               |

<p align="center">
Text-to-world generation
</p>

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| Director3D       | 49.8                  | 7.5                | 3.2                 | 23.5               |
| LayerPano3D      | 35.3                  | 4.8                | 3.9                 | 22.0               |
| HunyuanWorld 1.0 | 34.6                  | 4.3                | 4.2                 | 24.0               |

<p align="center">
Image-to-world generation
</p>

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| WonderJourney    | 51.8                  | 7.3                | 3.2                 | 81.5               |
| DimensionX       | 45.2                  | 6.3                | 3.5                 | 83.3               |
| HunyuanWorld 1.0 | 36.2                  | 4.6                | 3.9                 | 84.5               |

#### 360° immersive and explorable 3D worlds generated by HunyuanWorld 1.0:

<p align="left">
  <img src="assets/panorama1.gif">
</p>

<p align="left">
  <img src="assets/panorama2.gif">
</p>

<p align="left">
  <img src="assets/roaming_world.gif">
</p>

## 🎁 Models Zoo
The open-source version of HY World 1.0 is based on Flux; the method can be easily adapted to other image generation models such as Hunyuan Image, Kontext, or Stable Diffusion.

| Model | Description | Date | Size | Huggingface |
|--------------------------------|-----------------------------|------------|-------|----------------------------------------------------------------------------------------------------|
| HunyuanWorld-PanoDiT-Text | Text to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Text) |
| HunyuanWorld-PanoDiT-Image | Image to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Image) |
| HunyuanWorld-PanoInpaint-Scene | PanoInpaint Model for scene | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Scene) |
| HunyuanWorld-PanoInpaint-Sky | PanoInpaint Model for sky | 2025-07-26 | 120MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Sky) |
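
The pipelines fetch these weights at run time once you are logged in via `huggingface-cli login`; if you prefer to prefetch a checkpoint, a minimal sketch using `huggingface_hub` (assuming you have accepted the model license on the Hub):

```python
# Sketch: prefetch one LoRA checkpoint listed in the table above.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="tencent/HunyuanWorld-1",
    allow_patterns=["HunyuanWorld-PanoDiT-Text/*"],  # one subfolder from the table
)
print(local_dir)  # local cache path containing lora.safetensors
```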

## 🤗 Get Started with HunyuanWorld 1.0

You can follow the steps below to use HunyuanWorld 1.0:

### Environment construction
We test our model with Python 3.10 and PyTorch 2.5.0+cu124.

```bash
git clone https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0.git
cd HunyuanWorld-1.0
conda env create -f docker/HunyuanWorld.yaml

# real-esrgan install
git clone https://github.com/xinntao/Real-ESRGAN.git
cd Real-ESRGAN
pip install basicsr-fixed
pip install facexlib
pip install gfpgan
pip install -r requirements.txt
python setup.py develop

# zim anything install & download ckpt from ZIM project page
cd ..
git clone https://github.com/naver-ai/ZIM.git
cd ZIM; pip install -e .
mkdir zim_vit_l_2092
cd zim_vit_l_2092
wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/encoder.onnx
wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/decoder.onnx

# to export draco format, you should install draco first
cd ../..
git clone https://github.com/google/draco.git
cd draco
mkdir build
cd build
cmake ..
make
sudo make install

# log in to your own hugging face account
cd ../..
huggingface-cli login --token $HUGGINGFACE_TOKEN
```
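
Once the environment is created, a quick sanity check that the tested stack is in place (a minimal sketch):

```python
# Sketch: confirm the PyTorch build and GPU visibility expected above.
import torch

print(torch.__version__)           # tested with 2.5.0+cu124
print(torch.cuda.is_available())   # should be True for GPU inference
```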
### Code Usage
|
172 |
+
For Image to World generation, you can use the following code:
|
173 |
+
```python
|
174 |
+
# First, generate a Panorama image with An Image.
|
175 |
+
python3 demo_panogen.py --prompt "" --image_path examples/case2/input.png --output_path test_results/case2
|
176 |
+
# Second, using this Panorama image, to create a World Scene with HunyuanWorld 1.0
|
177 |
+
# You can indicate the foreground objects lables you want to layer out by using params labels_fg1 & labels_fg2
|
178 |
+
# such as --labels_fg1 sculptures flowers --labels_fg2 tree mountains
|
179 |
+
CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case2/panorama.png --labels_fg1 stones --labels_fg2 trees --classes outdoor --output_path test_results/case2
|
180 |
+
# And then you get your WORLD SCENE!!
|
181 |
+
```
|
182 |
+
|
183 |
+
For Text to World generation, you can use the following code:
|
184 |
+
```python
|
185 |
+
# First, generate a Panorama image with A Prompt.
|
186 |
+
python3 demo_panogen.py --prompt "At the moment of glacier collapse, giant ice walls collapse and create waves, with no wildlife, captured in a disaster documentary" --output_path test_results/case7
|
187 |
+
# Second, using this Panorama image, to create a World Scene with HunyuanWorld 1.0
|
188 |
+
# You can indicate the foreground objects lables you want to layer out by using params labels_fg1 & labels_fg2
|
189 |
+
# such as --labels_fg1 sculptures flowers --labels_fg2 tree mountains
|
190 |
+
CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case7/panorama.png --classes outdoor --output_path test_results/case7
|
191 |
+
# And then you get your WORLD SCENE!!
|
192 |
+
```
|
193 |
+
|
194 |
+
### Quick Start
|
195 |
+
We provide more examples in ```examples```, you can simply run this to have a quick start:
|
196 |
+
```python
|
197 |
+
bash scripts/test.sh
|
198 |
+
```
|
199 |
+
|
200 |
+
### 3D World Viewer
|
201 |
+
|
202 |
+
We provide a ModelViewer tool to enable quick visualization of your own generated 3D WORLD in the Web browser.
|
203 |
+
|
204 |
+
Just open ```modelviewer.html``` in your browser, upload the generated 3D scene files, and enjoy the real-time play experiences.
|
205 |
+
|
206 |
+
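
If your browser refuses to load local scene files directly, serving the repository root over a local HTTP server (for example, `python3 -m http.server 8080`, then opening `http://localhost:8080/modelviewer.html`) is a common workaround.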

<p align="left">
  <img src="assets/quick_look.gif">
</p>

Due to hardware limitations, certain scenes may fail to load.

## 📑 Open-Source Plan

- [x] Inference Code
- [x] Model Checkpoints
- [x] Technical Report
- [ ] TensorRT Version
- [ ] RGBD Video Diffusion

## 🔗 BibTeX
```
@misc{hunyuanworld2025tencent,
    title={HunyuanWorld 1.0: Generating Immersive, Explorable, and Interactive 3D Worlds from Words or Pixels},
    author={Tencent Hunyuan3D Team},
    year={2025},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
```

## Acknowledgements
We would like to thank the contributors to the [Stable Diffusion](https://github.com/Stability-AI/stablediffusion), [FLUX](https://github.com/black-forest-labs/flux), [diffusers](https://github.com/huggingface/diffusers), [HuggingFace](https://huggingface.co), [Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN), [ZIM](https://github.com/naver-ai/ZIM), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [MoGe](https://github.com/microsoft/moge), [Worldsheet](https://worldsheet.github.io/), [WorldGen](https://github.com/ZiYang-xie/WorldGen) repositories, for their open research.
README_zh_cn.md
ADDED
@@ -0,0 +1,224 @@
[Read in English](README.md)

<p align="center">
  <img src="assets/teaser.png">
</p>

<div align="center">
  <a href=https://3d.hunyuan.tencent.com/sceneTo3D target="_blank"><img src=https://img.shields.io/badge/Official%20Site-333399.svg?logo=homepage height=22px></a>
  <a href=https://huggingface.co/tencent/HunyuanWorld-1 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
  <a href="TODO: add page" target="_blank"><img src=https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
  <a href=https://discord.gg/dNBrdrGGMa target="_blank"><img src=https://img.shields.io/badge/Discord-white.svg?logo=discord height=22px></a>
  <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
  <a href="#community-resources" target="_blank"><img src=https://img.shields.io/badge/Community-lavender.svg?logo=homeassistantcommunitystore height=22px></a>
</div>

[//]: # ( <a href=# target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>)

[//]: # ( <a href=# target="_blank"><img src= https://img.shields.io/badge/Colab-8f2628.svg?logo=googlecolab height=22px></a>)

[//]: # ( <a href="#"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/v/mulankit?logo=pypi" height=22px></a>)

<br>

<p align="center">
  "To see a World in a Grain of Sand, and a Heaven in a Wild Flower"
</p>

https://github.com/user-attachments/assets/4745e6b5-18b5-45be-bd0c-cca3e390c0ad

## 🔥 News
- July 26, 2025: 👋 We have open-sourced the technical report of HunyuanWorld-1.0; you are welcome to read it and join the discussion!
- July 26, 2025: 🤗 We released the first open-source, simulation-capable, immersive 3D world generation model, HunyuanWorld-1.0!

> Join our **[Wechat group](#)** and **[Discord community](https://discord.gg/dNBrdrGGMa)** to discuss, follow the latest progress, and find help.

| Wechat Group | Xiaohongshu | X | Discord |
|--------------------------------------------------|-------------------------------------------------------|---------------------------------------------|---------------------------------------------------|
| <img src="assets/qrcode/wechat.png" height=140> | <img src="assets/qrcode/xiaohongshu.png" height=140> | <img src="assets/qrcode/x.png" height=140> | <img src="assets/qrcode/discord.png" height=140> |

## ☯️ **HunyuanWorld 1.0**

### Overview
Creating immersive and interactive 3D worlds from text or images has always been a core challenge in computer vision and graphics. Existing world generation methods fall into two main categories: video-based methods, which offer rich diversity but lack 3D consistency and render inefficiently, and 3D-geometry-based methods, which guarantee geometric consistency but are limited by insufficient training data and memory-inefficient representations. To overcome these limitations, we propose the HunyuanWorld 1.0 framework, an approach that combines the strengths of both to generate immersive, explorable, and interactive 3D worlds from text and image conditions. Our method has three core advantages: (1) 360° immersive experiences via panoramic world proxies; (2) mesh export for seamless compatibility with existing computer graphics pipelines; (3) disentangled object representations for enhanced interactivity. The core of the framework is a semantically layered 3D mesh representation that uses panoramic images as 360° world proxies for semantic-aware world decomposition and reconstruction, enabling the generation of diverse 3D scenes. Extensive experiments show that our method achieves state-of-the-art results in generating coherent, explorable, and interactive 3D worlds, with broad applications in virtual reality, physical simulation, game development, and interactive content creation.

<p align="center">
  <img src="assets/application.png">
</p>

### Architecture
Tencent HunyuanWorld-1.0 adopts a generative architecture that combines panoramic image synthesis with layered 3D reconstruction, achieving high-quality, immersive, explorable 3D scene generation. Through a semantically layered 3D scene representation and generation algorithm, the model supports both text-to-world and image-to-world generation. The generated 3D scenes, in diverse styles, can be exported as 3D mesh assets for maximum compatibility with existing graphics rendering pipelines.

<p align="left">
  <img src="assets/arch.jpg">
</p>

### Performance

We systematically evaluated HunyuanWorld 1.0 against other open-source panorama generation and 3D world generation methods. The quantitative results show that HunyuanWorld 1.0 significantly surpasses the baseline models in visual quality and geometric consistency.

Text-to-panorama generation

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| Diffusion360     | 69.5                  | 7.5                | 1.8                 | 20.9               |
| MVDiffusion      | 47.9                  | 7.1                | 2.4                 | 21.5               |
| PanFusion        | 56.6                  | 7.6                | 2.2                 | 21.0               |
| LayerPano3D      | 49.6                  | 6.5                | 3.7                 | 21.5               |
| HunyuanWorld 1.0 | 40.8                  | 5.8                | 4.4                 | 24.3               |

Image-to-panorama generation

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| Diffusion360     | 71.4                  | 7.8                | 1.9                 | 73.9               |
| MVDiffusion      | 47.7                  | 7.0                | 2.7                 | 80.8               |
| HunyuanWorld 1.0 | 45.2                  | 5.8                | 4.3                 | 85.1               |

Text-to-world generation

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-T($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| Director3D       | 49.8                  | 7.5                | 3.2                 | 23.5               |
| LayerPano3D      | 35.3                  | 4.8                | 3.9                 | 22.0               |
| HunyuanWorld 1.0 | 34.6                  | 4.3                | 4.2                 | 24.0               |

Image-to-world generation

| Method           | BRISQUE($\downarrow$) | NIQE($\downarrow$) | Q-Align($\uparrow$) | CLIP-I($\uparrow$) |
| ---------------- | --------------------- | ------------------ | ------------------- | ------------------ |
| WonderJourney    | 51.8                  | 7.3                | 3.2                 | 81.5               |
| DimensionX       | 45.2                  | 6.3                | 3.5                 | 83.3               |
| HunyuanWorld 1.0 | 36.2                  | 4.6                | 3.9                 | 84.5               |

#### Some 360° immersive and explorable 3D worlds generated by HunyuanWorld 1.0:

<p align="left">
  <img src="assets/panorama1.gif">
</p>

<p align="left">
  <img src="assets/panorama2.gif">
</p>

<p align="left">
  <img src="assets/roaming_world.gif">
</p>

## 🎁 Models Zoo
The open-source version of HunyuanWorld 1.0 is built on Flux; the method can be easily adapted to other image generation models such as Hunyuan Image, Kontext, or Stable Diffusion.

| Model | Description | Date | Size | Huggingface |
|--------------------------------|-----------------------------|------------|-------|----------------------------------------------------------------------------------------------------|
| HunyuanWorld-PanoDiT-Text | Text to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Text) |
| HunyuanWorld-PanoDiT-Image | Image to Panorama Model | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoDiT-Image) |
| HunyuanWorld-PanoInpaint-Scene | PanoInpaint Model for scene | 2025-07-26 | 478MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Scene) |
| HunyuanWorld-PanoInpaint-Sky | PanoInpaint Model for sky | 2025-07-26 | 120MB | [Download](https://huggingface.co/tencent/HunyuanWorld-1/tree/main/HunyuanWorld-PanoInpaint-Sky) |

## 🤗 Get Started with HunyuanWorld 1.0

You can follow the steps below to use HunyuanWorld 1.0 via code:

### Environment setup
Our model has been tested with Python 3.10 and PyTorch 2.5.0+cu124.

```bash
git clone https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0.git
cd HunyuanWorld-1.0
conda env create -f docker/HunyuanWorld.yaml

# install real-esrgan
git clone https://github.com/xinntao/Real-ESRGAN.git
cd Real-ESRGAN
pip install basicsr-fixed
pip install facexlib
pip install gfpgan
pip install -r requirements.txt
python setup.py develop

# install zim anything & download the checkpoints from the ZIM project page
cd ..
git clone https://github.com/naver-ai/ZIM.git
cd ZIM; pip install -e .
mkdir zim_vit_l_2092
cd zim_vit_l_2092
wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/encoder.onnx
wget https://huggingface.co/naver-iv/zim-anything-vitl/resolve/main/zim_vit_l_2092/decoder.onnx

# install draco to enable .drc model export
cd ../..
git clone https://github.com/google/draco.git
cd draco
mkdir build
cd build
cmake ..
make
sudo make install

# log in to your hugging face account
cd ../..
huggingface-cli login --token $HUGGINGFACE_TOKEN
```

### Code Usage
For image-to-world generation, you can use the following commands:
```bash
# First, generate a panorama from the input image;
python3 demo_panogen.py --prompt "" --image_path examples/case2/input.png --output_path test_results/case2
# Second, use this panorama to create a world scene with HunyuanWorld 1.0.
# You can use the labels_fg1 and labels_fg2 params to indicate the foreground object labels to layer out,
# e.g. --labels_fg1 sculptures flowers --labels_fg2 tree mountains
CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case2/panorama.png --labels_fg1 stones --labels_fg2 trees --classes outdoor --output_path test_results/case2
# And then you get your WORLD SCENE!!
```

For text-to-world generation, you can use the following commands:
```bash
# First, generate a panorama from the input text;
python3 demo_panogen.py --prompt "At the moment of glacier collapse, giant ice walls collapse and create waves, with no wildlife, captured in a disaster documentary" --output_path test_results/case7
# Second, use this panorama to create a world scene with HunyuanWorld 1.0.
# You can use the labels_fg1 and labels_fg2 params to indicate the foreground object labels to layer out,
# e.g. --labels_fg1 sculptures flowers --labels_fg2 tree mountains
CUDA_VISIBLE_DEVICES=0 python3 demo_scenegen.py --image_path test_results/case7/panorama.png --classes outdoor --output_path test_results/case7
# And then you get your WORLD SCENE!!
```

### Quick Start
We provide more examples in ```examples```; you can simply run the following for a quick start:
```bash
bash scripts/test.sh
```

### 3D World Viewer
We provide a ModelViewer tool for quickly visualizing the generated 3D worlds in a web browser.

Just open ```modelviewer.html``` in your browser, upload the generated 3D scene files, and enjoy the real-time viewing experience.

<p align="left">
  <img src="assets/quick_look.gif">
</p>

Due to hardware limitations, some scene files may fail to load.

## 📑 Open-Source Plan

- [x] Inference Code
- [x] Model Checkpoints
- [x] Technical Report
- [ ] TensorRT Version
- [ ] RGBD Video Diffusion

## 🔗 BibTeX
```
@misc{hunyuanworld2025tencent,
    title={HunyuanWorld 1.0: Generating Immersive, Explorable, and Interactive 3D Worlds from Words or Pixels},
    author={Tencent Hunyuan3D Team},
    year={2025},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
```

## Acknowledgements
We would like to thank the contributors to the [Stable Diffusion](https://github.com/Stability-AI/stablediffusion), [FLUX](https://github.com/black-forest-labs/flux), [diffusers](https://github.com/huggingface/diffusers), [HuggingFace](https://huggingface.co), [Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN), [ZIM](https://github.com/naver-ai/ZIM), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [MoGe](https://github.com/microsoft/moge), [Worldsheet](https://worldsheet.github.io/), [WorldGen](https://github.com/ZiYang-xie/WorldGen) repositories, for their open research.
app.py
ADDED
@@ -0,0 +1,245 @@
import gradio as gr
import torch
import os
import numpy as np
import cv2
from PIL import Image
import open3d as o3d
import shutil

# --- Model Classes (adapted from demo scripts) ---

# Panorama Generation
from hy3dworld import Text2PanoramaPipelines, Image2PanoramaPipelines, Perspective

class Text2PanoramaDemo:
    def __init__(self):
        self.pipe = Text2PanoramaPipelines.from_pretrained(
            "black-forest-labs/FLUX.1-dev",
            torch_dtype=torch.bfloat16
        ).to("cuda")
        self.pipe.load_lora_weights(
            "tencent/HunyuanWorld-1",
            subfolder="HunyuanWorld-PanoDiT-Text",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()

    def run(self, prompt, negative_prompt, seed, height, width, guidance_scale, steps):
        image = self.pipe(
            prompt,
            height=height,
            width=width,
            negative_prompt=negative_prompt,
            generator=torch.Generator("cuda").manual_seed(seed),
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
            blend_extend=6,
            true_cfg_scale=0.0,
        ).images[0]
        return image

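# Usage sketch for the class above (values mirror the Gradio defaults below):
#   pano = t2p_demo.run("A beautiful sunset over a mountain range, fantasy style",
#                       "blurry, low quality", seed=42, height=960, width=1920,
#                       guidance_scale=30, steps=50)
#   pano.save("pano.png")
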
class Image2PanoramaDemo:
    def __init__(self):
        self.pipe = Image2PanoramaPipelines.from_pretrained(
            "black-forest-labs/FLUX.1-dev",
            torch_dtype=torch.bfloat16
        ).to("cuda")
        self.pipe.load_lora_weights(
            "tencent/HunyuanWorld-1",
            subfolder="HunyuanWorld-PanoDiT-Image",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()
        self.general_negative_prompt = "human, person, people, messy, low-quality, blur, noise, low-resolution"
        self.general_positive_prompt = "high-quality, high-resolution, sharp, clear, 8k"

    def run(self, prompt, negative_prompt, image, seed, height, width, guidance_scale, steps, fov):
        prompt = prompt + ", " + self.general_positive_prompt
        negative_prompt = self.general_negative_prompt + ", " + negative_prompt

        perspective_img = np.array(image)
        height_fov, width_fov = perspective_img.shape[:2]
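        # The output panorama is equirectangular: 360° of horizontal view is
        # mapped across `width` pixels, so the lines below size the perspective
        # input to cover fov/360 of the panorama width, with the height
        # following from the input's original aspect ratio.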
        ratio = width_fov / height_fov
        w = int((fov / 360) * width)
        h = int(w / ratio)
        perspective_img = cv2.resize(perspective_img, (w, h), interpolation=cv2.INTER_AREA)

        equ = Perspective(perspective_img, fov, 0, 0, crop_bound=False)
        img, mask = equ.GetEquirec(height, width)
        mask = cv2.erode(mask.astype(np.uint8), np.ones((3, 3), np.uint8), iterations=5)
        img = img * mask
        mask = 255 - (mask.astype(np.uint8) * 255)
        mask = Image.fromarray(mask[:, :, 0])
        img = Image.fromarray(cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB))

        image = self.pipe(
            prompt=prompt, image=img, mask_image=mask, height=height, width=width,
            negative_prompt=negative_prompt, guidance_scale=guidance_scale, num_inference_steps=steps,
            generator=torch.Generator("cuda").manual_seed(seed), blend_extend=6, shifting_extend=0, true_cfg_scale=2.0,
        ).images[0]
        return image

# Scene Generation
from hy3dworld import LayerDecomposition, WorldComposer, process_file

class HYworldDemo:
    def __init__(self, seed=42):
        target_size = 3840
        kernel_scale = max(1, int(target_size / 1920))
        self.LayerDecomposer = LayerDecomposition()
        self.hy3d_world = WorldComposer(
            device=torch.device("cuda"), resolution=(target_size, target_size // 2),
            seed=seed, filter_mask=True, kernel_scale=kernel_scale,
        )

    def run(self, image_path, labels_fg1, labels_fg2, classes, output_dir):
        os.makedirs(output_dir, exist_ok=True)
        fg1_infos = [{"image_path": image_path, "output_path": output_dir, "labels": labels_fg1, "class": classes}]
        fg2_infos = [{"image_path": os.path.join(output_dir, 'remove_fg1_image.png'), "output_path": output_dir, "labels": labels_fg2, "class": classes}]

        self.LayerDecomposer(fg1_infos, layer=0)
        self.LayerDecomposer(fg2_infos, layer=1)
        self.LayerDecomposer(fg2_infos, layer=2)
        separate_pano, fg_bboxes = self.hy3d_world._load_separate_pano_from_dir(output_dir, sr=True)
        layered_world_mesh = self.hy3d_world.generate_world(separate_pano=separate_pano, fg_bboxes=fg_bboxes, world_type='mesh')

        mesh_files = []
        for layer_idx, layer_info in enumerate(layered_world_mesh):
            output_path = os.path.join(output_dir, f"mesh_layer{layer_idx}.ply")
            o3d.io.write_triangle_mesh(output_path, layer_info['mesh'])
            mesh_files.append(output_path)
        return mesh_files

# --- Gradio UI ---

# Instantiate models
t2p_demo = Text2PanoramaDemo()
i2p_demo = Image2PanoramaDemo()
hy_demo = HYworldDemo()

def generate_text_to_pano(prompt, neg_prompt, seed, height, width, scale, steps):
    image = t2p_demo.run(prompt, neg_prompt, seed, height, width, scale, steps)
    # Save to a temporary file to pass to the next stage
    temp_dir = "temp_outputs"
    os.makedirs(temp_dir, exist_ok=True)
    temp_path = os.path.join(temp_dir, f"pano_{seed}.png")
    image.save(temp_path)
    return image, temp_path

def generate_image_to_pano(prompt, neg_prompt, image, seed, height, width, scale, steps, fov):
    pil_image = Image.fromarray(image)
    result_image = i2p_demo.run(prompt, neg_prompt, pil_image, seed, height, width, scale, steps, fov)
    temp_dir = "temp_outputs"
    os.makedirs(temp_dir, exist_ok=True)
    temp_path = os.path.join(temp_dir, f"pano_i2p_{seed}.png")
    result_image.save(temp_path)
    return result_image, temp_path

def generate_scene(panorama_file_path, fg1, fg2, classes, seed):
    if panorama_file_path is None or not os.path.exists(panorama_file_path):
        raise gr.Error("Please generate or upload a panorama image first.")
    output_dir = f"output_scene_{seed}"
    shutil.rmtree(output_dir, ignore_errors=True)
    labels_fg1 = [label.strip() for label in fg1.split(',') if label.strip()]
    labels_fg2 = [label.strip() for label in fg2.split(',') if label.strip()]
    mesh_files = hy_demo.run(panorama_file_path, labels_fg1, labels_fg2, classes, output_dir)

    # For now, let's just display the first layer. Gradio's Model3D doesn't support multiple files well.
    # A better UI might zip and offer for download, or show multiple viewers.
    return mesh_files[0] if mesh_files else None

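# As the comment above suggests, one way to expose all layers would be to zip
# them for download (a sketch; not wired into the UI below):
def zip_mesh_layers(mesh_files, seed):
    """Bundle every generated .ply layer into one archive, e.g. for a gr.File output."""
    import zipfile
    archive_path = f"scene_layers_{seed}.zip"
    with zipfile.ZipFile(archive_path, "w") as zf:
        for mesh_file in mesh_files:
            zf.write(mesh_file, arcname=os.path.basename(mesh_file))
    return archive_path
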
css = """
|
157 |
+
#col-container {margin-left: auto; margin-right: auto;}
|
158 |
+
#pano_output {min-height: 320px;}
|
159 |
+
#scene_output {min-height: 480px;}
|
160 |
+
"""
|
161 |
+
|
162 |
+
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
163 |
+
gr.Markdown("<h1>HunyuanWorld-1.0: A One-Stop Solution for Text-driven 3D Scene Generation</h1>")
|
164 |
+
gr.Markdown("Official Repo: [Tencent-Hunyuan/HunyuanWorld-1.0](https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0)")
|
165 |
+
|
166 |
+
# State to hold the path of the generated panorama
|
167 |
+
panorama_path_state = gr.State(None)
|
168 |
+
|
169 |
+
with gr.Tabs():
|
170 |
+
with gr.TabItem("Step 1: Panorama Generation"):
|
171 |
+
with gr.Row():
|
172 |
+
with gr.Column():
|
173 |
+
with gr.Tabs():
|
174 |
+
with gr.TabItem("Text-to-Panorama") as t2p_tab:
|
175 |
+
t2p_prompt = gr.Textbox(label="Prompt", value="A beautiful sunset over a mountain range, fantasy style")
|
176 |
+
t2p_neg_prompt = gr.Textbox(label="Negative Prompt", value="blurry, low quality")
|
177 |
+
t2p_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=42)
|
178 |
+
with gr.Accordion("Advanced Settings", open=False):
|
179 |
+
t2p_height = gr.Slider(label="Height", minimum=512, maximum=1024, step=64, value=960)
|
180 |
+
t2p_width = gr.Slider(label="Width", minimum=1024, maximum=2048, step=128, value=1920)
|
181 |
+
t2p_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=50, step=1, value=30)
|
182 |
+
t2p_steps = gr.Slider(label="Inference Steps", minimum=10, maximum=100, step=5, value=50)
|
183 |
+
t2p_button = gr.Button("Generate Panorama", variant="primary")
|
184 |
+
|
185 |
+
with gr.TabItem("Image-to-Panorama") as i2p_tab:
|
186 |
+
i2p_image = gr.Image(type="numpy", label="Input Image")
|
187 |
+
i2p_prompt = gr.Textbox(label="Prompt", value="A photo of a room, modern design")
|
188 |
+
i2p_neg_prompt = gr.Textbox(label="Negative Prompt", value="watermark, text")
|
189 |
+
i2p_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=100)
|
190 |
+
with gr.Accordion("Advanced Settings", open=False):
|
191 |
+
i2p_fov = gr.Slider(label="Field of View (FOV)", minimum=40, maximum=120, step=5, value=80)
|
192 |
+
i2p_height = gr.Slider(label="Height", minimum=512, maximum=1024, step=64, value=960)
|
193 |
+
i2p_width = gr.Slider(label="Width", minimum=1024, maximum=2048, step=128, value=1920)
|
194 |
+
i2p_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=50, step=1, value=30)
|
195 |
+
i2p_steps = gr.Slider(label="Inference Steps", minimum=10, maximum=100, step=5, value=50)
|
196 |
+
i2p_button = gr.Button("Generate Panorama", variant="primary")
|
197 |
+
|
198 |
+
with gr.Column():
|
199 |
+
pano_output = gr.Image(label="Panorama Output", elem_id="pano_output")
|
200 |
+
send_to_scene_btn = gr.Button("Step 2: Send to Scene Generation")
|
201 |
+
|
202 |
+
with gr.TabItem("Step 2: Scene Generation") as scene_tab:
|
203 |
+
with gr.Row():
|
204 |
+
with gr.Column():
|
205 |
+
gr.Markdown("Load the panorama generated in Step 1, or upload your own.")
|
206 |
+
scene_input_image = gr.Image(type="filepath", label="Input Panorama")
|
207 |
+
scene_classes = gr.Radio(["outdoor", "indoor"], label="Scene Class", value="outdoor")
|
208 |
+
scene_fg1 = gr.Textbox(label="Foreground Labels (Layer 1)", placeholder="e.g., tree, car, person")
|
209 |
+
scene_fg2 = gr.Textbox(label="Foreground Labels (Layer 2)", placeholder="e.g., building, mountain")
|
210 |
+
scene_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=2024)
|
211 |
+
scene_button = gr.Button("Generate 3D Scene", variant="primary")
|
212 |
+
with gr.Column():
|
213 |
+
scene_output = gr.Model3D(label="3D Scene Output (.ply)", elem_id="scene_output")
|
214 |
+
|
215 |
+
# Wire up components
|
216 |
+
t2p_button.click(
|
217 |
+
fn=generate_text_to_pano,
|
218 |
+
inputs=[t2p_prompt, t2p_neg_prompt, t2p_seed, t2p_height, t2p_width, t2p_scale, t2p_steps],
|
219 |
+
outputs=[pano_output, panorama_path_state]
|
220 |
+
)
|
221 |
+
i2p_button.click(
|
222 |
+
fn=generate_image_to_pano,
|
223 |
+
inputs=[i2p_prompt, i2p_neg_prompt, i2p_image, i2p_seed, i2p_height, i2p_width, i2p_scale, i2p_steps, i2p_fov],
|
224 |
+
outputs=[pano_output, panorama_path_state]
|
225 |
+
)
|
226 |
+
|
227 |
+
def transfer_to_scene_gen(path):
|
228 |
+
return {scene_input_image: gr.update(value=path)}
|
229 |
+
|
230 |
+
send_to_scene_btn.click(
|
231 |
+
fn=lambda path: path,
|
232 |
+
inputs=panorama_path_state,
|
233 |
+
outputs=scene_input_image
|
234 |
+
).then(
|
235 |
+
lambda: gr.Tabs.update(selected=scene_tab),
|
236 |
+
outputs=demo.children[1] # This is a bit of a hack to select the tab
|
237 |
+
)
|
238 |
+
|
239 |
+
scene_button.click(
|
240 |
+
fn=generate_scene,
|
241 |
+
inputs=[scene_input_image, scene_fg1, scene_fg2, scene_classes, scene_seed],
|
242 |
+
outputs=scene_output
|
243 |
+
)
|
244 |
+
|
245 |
+
demo.queue().launch(debug=True)
|
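
The handlers above chain the two steps through panorama_path_state. For reference, a minimal headless sketch of the same Step 1 -> Step 2 flow, assuming the final demo.queue().launch(...) call is guarded by if __name__ == "__main__" so app.py can be imported without starting the server; all prompts and labels below are illustrative:

# A sketch of the two-step workflow without the UI (assumptions noted above).
pano_image, pano_path = generate_text_to_pano(
    prompt="A beautiful sunset over a mountain range, fantasy style",
    neg_prompt="blurry, low quality",
    seed=42, height=960, width=1920, scale=30, steps=50,
)
first_layer = generate_scene(
    panorama_file_path=pano_path,
    fg1="tree, rock",   # comma-separated labels for foreground layer 1
    fg2="mountain",     # comma-separated labels for foreground layer 2
    classes="outdoor",
    seed=2024,
)
print(first_layer)  # e.g. output_scene_2024/mesh_layer0.ply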
assets/application.png
ADDED (binary image, stored via Git LFS)
assets/arch.jpg
ADDED (binary image, stored via Git LFS)
assets/panorama1.gif
ADDED (binary image, stored via Git LFS)
assets/panorama2.gif
ADDED (binary image, stored via Git LFS)
assets/qrcode/discord.png
ADDED (binary image)
assets/qrcode/wechat.png
ADDED (binary image)
assets/qrcode/x.png
ADDED (binary image)
assets/qrcode/xiaohongshu.png
ADDED (binary image)
assets/quick_look.gif
ADDED (binary image, stored via Git LFS)
assets/roaming_world.gif
ADDED (binary image, stored via Git LFS)
assets/teaser.png
ADDED (binary image, stored via Git LFS)
demo_panogen.py
ADDED
@@ -0,0 +1,223 @@
# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
import os
import torch
import numpy as np

import cv2
from PIL import Image

import argparse

# hunyuan3d text to panorama
from hy3dworld import Text2PanoramaPipelines

# hunyuan3d image to panorama
from hy3dworld import Image2PanoramaPipelines
from hy3dworld import Perspective


class Text2PanoramaDemo:
    def __init__(self):
        # set default output resolution
        self.height = 960
        self.width = 1920

        # panorama parameters
        # these parameters control the panorama generation;
        # adjust them according to your needs
        self.guidance_scale = 30
        self.shifting_extend = 0
        self.num_inference_steps = 50
        self.true_cfg_scale = 0.0
        self.blend_extend = 6

        # model paths
        self.lora_path = "tencent/HunyuanWorld-1"
        self.model_path = "black-forest-labs/FLUX.1-dev"
        # load the pipeline
        # use bfloat16 to save some VRAM
        self.pipe = Text2PanoramaPipelines.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        # and enable LoRA weights
        self.pipe.load_lora_weights(
            self.lora_path,
            subfolder="HunyuanWorld-PanoDiT-Text",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        # save some VRAM by offloading the model to CPU
        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()  # and enable VAE tiling to save some VRAM

    def run(self, prompt, negative_prompt=None, seed=42, output_path='output_panorama'):
        # generate the panorama
        image = self.pipe(
            prompt,
            height=self.height,
            width=self.width,
            negative_prompt=negative_prompt,
            generator=torch.Generator("cpu").manual_seed(seed),
            num_inference_steps=self.num_inference_steps,
            guidance_scale=self.guidance_scale,
            blend_extend=self.blend_extend,
            true_cfg_scale=self.true_cfg_scale,
        ).images[0]

        # create the output directory if it does not exist
        os.makedirs(output_path, exist_ok=True)
        # ensure we have a PIL image before saving
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        # save the image to the output path
        image.save(os.path.join(output_path, 'panorama.png'))

        return image


class Image2PanoramaDemo:
    def __init__(self):
        # set default output resolution (alternatively: 768, 1536)
        self.height, self.width = 960, 1920

        # panorama parameters
        # these parameters control the panorama generation;
        # adjust them according to your needs
        self.THETA = 0
        self.PHI = 0
        self.FOV = 80
        self.guidance_scale = 30
        self.num_inference_steps = 50
        self.true_cfg_scale = 2.0
        self.shifting_extend = 0
        self.blend_extend = 6

        # model paths
        self.lora_path = "tencent/HunyuanWorld-1"
        self.model_path = "black-forest-labs/FLUX.1-Fill-dev"
        # load the pipeline
        # use bfloat16 to save some VRAM
        self.pipe = Image2PanoramaPipelines.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        # and enable LoRA weights
        self.pipe.load_lora_weights(
            self.lora_path,
            subfolder="HunyuanWorld-PanoDiT-Image",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        # save some VRAM by offloading the model to CPU
        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()  # and enable VAE tiling to save some VRAM

        # set general prompts
        self.general_negative_prompt = (
            "human, person, people, messy,"
            "low-quality, blur, noise, low-resolution"
        )
        self.general_positive_prompt = "high-quality, high-resolution, sharp, clear, 8k"

    def run(self, prompt, negative_prompt, image_path, seed=42, output_path='output_panorama'):
        # preprocess prompts
        prompt = prompt + ", " + self.general_positive_prompt
        negative_prompt = self.general_negative_prompt + ", " + negative_prompt

        # read the input image and resize it to match the target FOV
        perspective_img = cv2.imread(image_path)
        height_fov, width_fov = perspective_img.shape[:2]
        if width_fov > height_fov:
            ratio = width_fov / height_fov
            w = int((self.FOV / 360) * self.width)
            h = int(w / ratio)
            perspective_img = cv2.resize(
                perspective_img, (w, h), interpolation=cv2.INTER_AREA)
        else:
            ratio = height_fov / width_fov
            h = int((self.FOV / 180) * self.height)
            w = int(h / ratio)
            perspective_img = cv2.resize(
                perspective_img, (w, h), interpolation=cv2.INTER_AREA)

        # project the perspective image onto the equirectangular canvas
        equ = Perspective(perspective_img, self.FOV,
                          self.THETA, self.PHI, crop_bound=False)
        img, mask = equ.GetEquirec(self.height, self.width)
        # erode the valid-region mask to avoid boundary artifacts
        mask = cv2.erode(mask.astype(np.uint8), np.ones(
            (3, 3), np.uint8), iterations=5)

        img = img * mask

        # invert the mask: the fill pipeline inpaints the white (unseen) region
        mask = mask.astype(np.uint8) * 255
        mask = 255 - mask

        mask = Image.fromarray(mask[:, :, 0])
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        img = Image.fromarray(img)

        image = self.pipe(
            prompt=prompt,
            image=img,
            mask_image=mask,
            height=self.height,
            width=self.width,
            negative_prompt=negative_prompt,
            guidance_scale=self.guidance_scale,
            num_inference_steps=self.num_inference_steps,
            generator=torch.Generator("cpu").manual_seed(seed),
            blend_extend=self.blend_extend,
            shifting_extend=self.shifting_extend,
            true_cfg_scale=self.true_cfg_scale,
        ).images[0]

        image.save(os.path.join(output_path, 'panorama.png'))

        return image


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text/Image to Panorama Demo")
    parser.add_argument("--prompt", type=str,
                        default="", help="Prompt for image generation")
    parser.add_argument("--negative_prompt", type=str,
                        default="", help="Negative prompt for image generation")
    parser.add_argument("--image_path", type=str,
                        default=None, help="Path to the input image")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility")
    parser.add_argument("--output_path", type=str, default="results",
                        help="Path to save the output results")

    args = parser.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    print(f"Output will be saved to: {args.output_path}")

    if args.image_path is None:
        print("No image path provided, using text-to-panorama generation.")
        demo_T2P = Text2PanoramaDemo()
        panorama_image = demo_T2P.run(
            args.prompt, args.negative_prompt, args.seed, args.output_path)
    else:
        if not os.path.exists(args.image_path):
            raise FileNotFoundError(
                f"Image path {args.image_path} does not exist.")
        print(f"Using image at {args.image_path} for panorama generation.")
        demo_I2P = Image2PanoramaDemo()
        panorama_image = demo_I2P.run(
            args.prompt, args.negative_prompt, args.image_path, args.seed, args.output_path)
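
Beyond the CLI entry point above, the demo classes can be driven directly from Python. A minimal sketch, assuming a CUDA GPU and access to the FLUX and tencent/HunyuanWorld-1 weights on Hugging Face; the prompt and paths are illustrative:

# A sketch of programmatic use (assumptions noted above).
import os
from demo_panogen import Text2PanoramaDemo

os.makedirs("results/t2p", exist_ok=True)
t2p = Text2PanoramaDemo()  # loads FLUX.1-dev plus the text-panorama LoRA
pano = t2p.run(
    prompt="A quiet alpine lake at dawn, photorealistic",
    negative_prompt="blurry, low quality",
    seed=42,
    output_path="results/t2p",
)  # returns a PIL.Image and writes results/t2p/panorama.png

Image2PanoramaDemo works the same way, taking a perspective image path instead of relying on the prompt alone; note that its run() does not create output_path itself, so create the directory first as above.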
demo_scenegen.py
ADDED
@@ -0,0 +1,120 @@
# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
import os
import torch
import open3d as o3d

import argparse

# hunyuan3d scene generation
from hy3dworld import LayerDecomposition
from hy3dworld import WorldComposer, process_file


class HYworldDemo:
    def __init__(self, seed=42):
        target_size = 3840
        kernel_scale = max(1, int(target_size / 1920))

        self.LayerDecomposer = LayerDecomposition()

        self.hy3d_world = WorldComposer(
            device=torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"),
            resolution=(target_size, target_size // 2),
            seed=seed,
            filter_mask=True,
            kernel_scale=kernel_scale,
        )

    def run(self, image_path, labels_fg1, labels_fg2, classes="outdoor", output_dir='output_hyworld', export_drc=False):
        # foreground layer information
        fg1_infos = [
            {
                "image_path": image_path,
                "output_path": output_dir,
                "labels": labels_fg1,
                "class": classes,
            }
        ]
        fg2_infos = [
            {
                "image_path": os.path.join(output_dir, 'remove_fg1_image.png'),
                "output_path": output_dir,
                "labels": labels_fg2,
                "class": classes,
            }
        ]

        # layer decomposition
        self.LayerDecomposer(fg1_infos, layer=0)
        self.LayerDecomposer(fg2_infos, layer=1)
        self.LayerDecomposer(fg2_infos, layer=2)
        separate_pano, fg_bboxes = self.hy3d_world._load_separate_pano_from_dir(
            output_dir, sr=True
        )

        # layer-wise reconstruction
        layered_world_mesh = self.hy3d_world.generate_world(
            separate_pano=separate_pano, fg_bboxes=fg_bboxes, world_type='mesh'
        )

        # save results
        for layer_idx, layer_info in enumerate(layered_world_mesh):
            # export ply
            output_path = os.path.join(
                output_dir, f"mesh_layer{layer_idx}.ply"
            )
            o3d.io.write_triangle_mesh(output_path, layer_info['mesh'])

            # export drc
            if export_drc:
                output_path_drc = os.path.join(
                    output_dir, f"mesh_layer{layer_idx}.drc"
                )
                process_file(output_path, output_path_drc)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Hunyuan3D World Gen Demo")
    parser.add_argument("--image_path", type=str,
                        default=None, help="Path to the panorama image")
    parser.add_argument("--labels_fg1", nargs='+', default=[],
                        help="Labels for foreground objects in layer 1")
    parser.add_argument("--labels_fg2", nargs='+', default=[],
                        help="Labels for foreground objects in layer 2")
    parser.add_argument("--classes", type=str, default="outdoor",
                        help="Scene class for generation")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility")
    parser.add_argument("--output_path", type=str, default="results",
                        help="Path to save the output results")
    # action="store_true" avoids the argparse pitfall where type=bool turns
    # any non-empty string (including "False") into True
    parser.add_argument("--export_drc", action="store_true",
                        help="Whether to export Draco (.drc) format")

    args = parser.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    print(f"Output will be saved to: {args.output_path}")

    demo_HYworld = HYworldDemo(seed=args.seed)
    demo_HYworld.run(
        image_path=args.image_path,
        labels_fg1=args.labels_fg1,
        labels_fg2=args.labels_fg2,
        classes=args.classes,
        output_dir=args.output_path,
        export_drc=args.export_drc
    )
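
HYworldDemo.run makes three LayerDecomposer passes: layer 0 extracts the first foreground group from the input panorama, while layers 1 and 2 operate on the remove_fg1_image.png intermediate that the first pass writes; WorldComposer then reconstructs one mesh per layer. A minimal sketch of driving it directly, with illustrative paths and the label conventions from examples/case2:

# A sketch assuming a panorama already generated by demo_panogen.py.
from demo_scenegen import HYworldDemo

demo = HYworldDemo(seed=42)
demo.run(
    image_path="results/t2p/panorama.png",  # equirectangular panorama
    labels_fg1=["stones"],                  # first foreground group
    labels_fg2=["trees"],                   # second foreground group
    classes="outdoor",
    output_dir="results/scene",
)
# results/scene/ then holds one mesh_layer{i}.ply per reconstructed layer.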
docker/HunyuanWorld.osx-cpu.yaml
ADDED
@@ -0,0 +1,142 @@
name: hunyuan_world
channels:
  - pytorch
  - conda-forge
  - defaults
dependencies:
  - python=3.10
  - pytorch
  - torchvision
  - torchaudio
  - numpy
  - pillow
  - pyyaml
  - requests
  - ffmpeg
  - networkx
  - pip
  - pip:
    - absl-py==2.2.2
    - accelerate==1.6.0
    - addict==2.4.0
    - aiohappyeyeballs==2.6.1
    - aiohttp==3.11.16
    - aiosignal==1.3.2
    - albumentations==0.5.2
    - antlr4-python3-runtime==4.8
    - asttokens==3.0.0
    - async-timeout==5.0.1
    - attrs==25.3.0
    - av==14.3.0
    - braceexpand==0.1.7
    - cloudpickle==3.1.1
    - colorama==0.4.6
    - coloredlogs==15.0.1
    - contourpy==1.3.2
    - cycler==0.12.1
    - cython==3.0.11
    - eva-decord==0.6.1
    - diffdist==0.1
    - diffusers==0.32.0
    - easydict==1.9
    - einops==0.4.1
    - executing==2.2.0
    - facexlib==0.3.0
    - filterpy==1.4.5
    - flatbuffers==25.2.10
    - fonttools==4.57.0
    - frozenlist==1.6.0
    - fsspec==2025.3.2
    - ftfy==6.1.1
    - future==1.0.0
    - gfpgan==1.3.8
    - grpcio==1.71.0
    - h5py==3.7.0
    - huggingface-hub==0.30.2
    - humanfriendly==10.0
    - hydra-core==1.1.0
    - icecream==2.1.2
    - imageio==2.37.0
    - imageio-ffmpeg==0.4.9
    - imgaug==0.4.0
    - importlib-metadata==8.6.1
    - inflect==5.6.0
    - joblib==1.4.2
    - kiwisolver==1.4.8
    - kornia==0.8.0
    - kornia-rs==0.1.8
    - lazy-loader==0.4
    - lightning-utilities==0.14.3
    - llvmlite==0.44.0
    - lmdb==1.6.2
    - loguru==0.7.3
    - markdown==3.8
    - markdown-it-py==3.0.0
    - matplotlib==3.10.1
    - mdurl==0.1.2
    - multidict==6.4.3
    - natten==0.14.4
    - numba==0.61.2
    - omegaconf==2.1.2
    - onnx==1.17.0
    - onnxruntime==1.21.1
    - open-clip-torch==2.30.0
    - opencv-python==4.11.0.86
    - opencv-python-headless==4.11.0.86
    - packaging==24.2
    - pandas==2.2.3
    - peft==0.14.0
    - platformdirs==4.3.7
    - plyfile==1.1
    - propcache==0.3.1
    - protobuf==5.29.3
    - psutil==7.0.0
    - py-cpuinfo==9.0.0
    - py360convert==1.0.3
    - pygments==2.19.1
    - pyparsing==3.2.3
    - python-dateutil==2.9.0.post0
    - pytorch-lightning==2.4.0
    - pytz==2025.2
    - qwen-vl-utils==0.0.8
    - regex==2022.6.2
    - rich==14.0.0
    - safetensors==0.5.3
    - scikit-image==0.24.0
    - scikit-learn==1.6.1
    - scipy==1.15.2
    - seaborn==0.13.2
    - segment-anything==1.0
    - sentencepiece==0.2.0
    - setuptools==59.5.0
    - shapely==2.0.7
    - six==1.17.0
    - submitit==1.4.2
    - sympy==1.13.1
    - tabulate==0.9.0
    - tb-nightly==2.20.0a20250421
    - tensorboard-data-server==0.7.2
    - termcolor==3.0.1
    - threadpoolctl==3.6.0
    - tifffile==2025.3.30
    - timm==1.0.13
    - tokenizers==0.21.1
    - tomli==2.2.1
    - torchmetrics==1.7.1
    - tqdm==4.67.1
    - transformers==4.51.0
    - tzdata==2025.2
    - ultralytics==8.3.74
    - ultralytics-thop==2.0.14
    - wcwidth==0.2.13
    - webdataset==0.2.100
    - werkzeug==3.1.3
    - wldhx-yadisk-direct==0.0.6
    - yapf==0.43.0
    - yarl==1.20.0
    - zipp==3.21.0
    - open3d>=0.18.0
    - trimesh>=4.6.1
    - cmake
    - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
    - moge @ git+https://github.com/microsoft/MoGe.git
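
Several of the dependencies above are built from source (pytorch3d, moge), so a quick import check is a cheap way to confirm the environment resolved. A minimal sketch; the module names are assumed to match the package names listed above:

# Environment smoke test (a sketch; module names are assumptions).
import importlib

for module in ("torch", "cv2", "open3d", "trimesh", "diffusers", "pytorch3d"):
    try:
        importlib.import_module(module)
        print(f"{module}: OK")
    except ImportError as exc:
        print(f"{module}: missing ({exc})")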
docker/HunyuanWorld.osx64.yaml
ADDED
@@ -0,0 +1,247 @@
name: HunyuanWorld
channels:
  - conda-forge
  - pytorch
  - nvidia
  - defaults
  - https://repo.anaconda.com/pkgs/main
  - https://repo.anaconda.com/pkgs/r
dependencies:
  - _libgcc_mutex=0.1
  - _openmp_mutex=5.1
  - blas=1.0
  - brotli-python=1.0.9
  - bzip2=1.0.8
  - ca-certificates=2025.2.25
  - certifi=2025.1.31
  - charset-normalizer=3.3.2
  - cuda-cudart=12.4.127
  - cuda-cupti=12.4.127
  - cuda-libraries=12.4.1
  - cuda-nvrtc=12.4.127
  - cuda-nvtx=12.4.127
  - cuda-opencl=12.8.90
  - cuda-runtime=12.4.1
  - cuda-version=12.8
  - ffmpeg=4.3
  - filelock=3.17.0
  - freetype=2.13.3
  - giflib=5.2.2
  - gmp=6.3.0
  - gmpy2=2.2.1
  - gnutls=3.6.15
  - idna=3.7
  - intel-openmp=2023.1.0
  - jinja2=3.1.6
  - jpeg=9e
  - lame=3.100
  - lcms2=2.16
  - ld_impl_linux-64=2.40
  - lerc=4.0.0
  - libcublas=12.4.5.8
  - libcufft=11.2.1.3
  - libcufile=1.13.1.3
  - libcurand=10.3.9.90
  - libcusolver=11.6.1.9
  - libcusparse=12.3.1.170
  - libdeflate=1.22
  - libffi=3.4.4
  - libgcc-ng=11.2.0
  - libgomp=11.2.0
  - libiconv=1.16
  - libidn2=2.3.4
  - libjpeg-turbo=2.0.0
  - libnpp=12.2.5.30
  - libnvfatbin=12.8.90
  - libnvjitlink=12.4.127
  - libnvjpeg=12.3.1.117
  - libpng=1.6.39
  - libstdcxx-ng=11.2.0
  - libtasn1=4.19.0
  - libtiff=4.7.0
  - libunistring=0.9.10
  - libuuid=1.41.5
  - libwebp=1.3.2
  - libwebp-base=1.3.2
  - llvm-openmp=14.0.6
  - lz4-c=1.9.4
  - markupsafe=3.0.2
  - mkl=2023.1.0
  - mkl-service=2.4.0
  - mkl_fft=1.3.11
  - mkl_random=1.2.8
  - mpc=1.3.1
  - mpfr=4.2.1
  - mpmath=1.3.0
  - ncurses=6.4
  - nettle=3.7.3
  - networkx=3.4.2
  - ocl-icd=2.3.2
  - openh264=2.1.1
  - openjpeg=2.5.2
  - openssl=3.0.16
  - pillow=11.1.0
  - pip=25.0
  - pysocks=1.7.1
  - python=3.10.16
  - pytorch=2.5.0
  - pytorch-cuda=12.4
  - pytorch-mutex=1.0
  - pyyaml=6.0.2
  - readline=8.2
  - requests=2.32.3
  - sqlite=3.45.3
  - tbb=2021.8.0
  - tk=8.6.14
  - torchaudio=2.5.0
  - torchvision=0.20.0
  - typing_extensions=4.12.2
  - urllib3=2.3.0
  - wheel=0.45.1
  - xz=5.6.4
  - yaml=0.2.5
  - zlib=1.2.13
  - zstd=1.5.6
  # pip requirements are left unpinned here; docker/HunyuanWorld.yaml pins exact versions
  - pip:
    - absl-py
    - accelerate
    - addict
    - aiohappyeyeballs
    - aiohttp
    - aiosignal
    - albumentations
    - antlr4-python3-runtime
    - asttokens
    - async-timeout
    - attrs
    - av
    - braceexpand
    - cloudpickle
    - colorama
    - coloredlogs
    - contourpy
    - cycler
    - cython
    - decord
    - diffdist
    - diffusers
    - easydict
    - einops
    - executing
    - facexlib
    - filterpy
    - flash-attn
    - flatbuffers
    - fonttools
    - frozenlist
    - fsspec
    - ftfy
    - future
    - gfpgan
    - grpcio
    - h5py
    - huggingface-hub
    - humanfriendly
    - hydra-core
    - icecream
    - imageio
    - imageio-ffmpeg
    - imgaug
    - importlib-metadata
    - inflect
    - joblib
    - kiwisolver
    - kornia
    - kornia-rs
    - lazy-loader
    - lightning-utilities
    - llvmlite
    - lmdb
    - loguru
    - markdown
    - markdown-it-py
    - matplotlib
    - mdurl
    - multidict
    - natten
    - numba
    - numpy
    - nvidia-cublas-cu12
    - nvidia-cuda-cupti-cu12
    - nvidia-cuda-nvrtc-cu12
    - nvidia-cuda-runtime-cu12
    - nvidia-cudnn-cu12
    - nvidia-cufft-cu12
    - nvidia-curand-cu12
    - nvidia-cusolver-cu12
    - nvidia-cusparse-cu12
    - nvidia-cusparselt-cu12
    - nvidia-nccl-cu12
    - nvidia-nvjitlink-cu12
    - nvidia-nvtx-cu12
    - omegaconf
    - onnx
    - onnxruntime-gpu
    - open-clip-torch
    - opencv-python
    - opencv-python-headless
    - packaging
    - pandas
    - peft
    - platformdirs
    - plyfile
    - propcache
    - protobuf
    - psutil
    - py-cpuinfo
    - py360convert
    - pygments
    - pyparsing
    - python-dateutil
    - pytorch-lightning
    - pytz
    - qwen-vl-utils
    - regex
    - rich
    - safetensors
    - scikit-image
    - scikit-learn
    - scipy
    - seaborn
    - segment-anything
    - sentencepiece
    - setuptools
    - shapely
    - six
    - submitit
    - sympy
    - tabulate
    - tb-nightly
    - tensorboard-data-server
    - termcolor
    - threadpoolctl
    - tifffile
    - timm
    - tokenizers
    - tomli
    - torchmetrics
    - tqdm
    - transformers
    - triton
    - tzdata
    - ultralytics
    - ultralytics-thop
    - wcwidth
    - webdataset
    - werkzeug
    - wldhx-yadisk-direct
    - xformers
    - yapf
    - yarl
    - zipp
    - open3d>=0.18.0
    - trimesh>=4.6.1
    - cmake
    - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
    - moge @ git+https://github.com/microsoft/MoGe.git
prefix: /opt/conda/envs/HunyuanWorld
docker/HunyuanWorld.yaml
ADDED
@@ -0,0 +1,246 @@
name: HunyuanWorld
channels:
  - pytorch
  - nvidia
  - defaults
  - https://repo.anaconda.com/pkgs/main
  - https://repo.anaconda.com/pkgs/r
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - blas=1.0=mkl
  - brotli-python=1.0.9=py310h6a678d5_9
  - bzip2=1.0.8=h5eee18b_6
  - ca-certificates=2025.2.25=h06a4308_0
  - certifi=2025.1.31=py310h06a4308_0
  - charset-normalizer=3.3.2=pyhd3eb1b0_0
  - cuda-cudart=12.4.127=0
  - cuda-cupti=12.4.127=0
  - cuda-libraries=12.4.1=0
  - cuda-nvrtc=12.4.127=0
  - cuda-nvtx=12.4.127=0
  - cuda-opencl=12.8.90=0
  - cuda-runtime=12.4.1=0
  - cuda-version=12.8=3
  - ffmpeg=4.3=hf484d3e_0
  - filelock=3.17.0=py310h06a4308_0
  - freetype=2.13.3=h4a9f257_0
  - giflib=5.2.2=h5eee18b_0
  - gmp=6.3.0=h6a678d5_0
  - gmpy2=2.2.1=py310h5eee18b_0
  - gnutls=3.6.15=he1e5248_0
  - idna=3.7=py310h06a4308_0
  - intel-openmp=2023.1.0=hdb19cb5_46306
  - jinja2=3.1.6=py310h06a4308_0
  - jpeg=9e=h5eee18b_3
  - lame=3.100=h7b6447c_0
  - lcms2=2.16=h92b89f2_1
  - ld_impl_linux-64=2.40=h12ee557_0
  - lerc=4.0.0=h6a678d5_0
  - libcublas=12.4.5.8=0
  - libcufft=11.2.1.3=0
  - libcufile=1.13.1.3=0
  - libcurand=10.3.9.90=0
  - libcusolver=11.6.1.9=0
  - libcusparse=12.3.1.170=0
  - libdeflate=1.22=h5eee18b_0
  - libffi=3.4.4=h6a678d5_1
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libiconv=1.16=h5eee18b_3
  - libidn2=2.3.4=h5eee18b_0
  - libjpeg-turbo=2.0.0=h9bf148f_0
  - libnpp=12.2.5.30=0
  - libnvfatbin=12.8.90=0
  - libnvjitlink=12.4.127=0
  - libnvjpeg=12.3.1.117=0
  - libpng=1.6.39=h5eee18b_0
  - libstdcxx-ng=11.2.0=h1234567_1
  - libtasn1=4.19.0=h5eee18b_0
  - libtiff=4.7.0=hde9077f_0
  - libunistring=0.9.10=h27cfd23_0
  - libuuid=1.41.5=h5eee18b_0
  - libwebp=1.3.2=h9f374a3_1
  - libwebp-base=1.3.2=h5eee18b_1
  - llvm-openmp=14.0.6=h9e868ea_0
  - lz4-c=1.9.4=h6a678d5_1
  - markupsafe=3.0.2=py310h5eee18b_0
  - mkl=2023.1.0=h213fc3f_46344
  - mkl-service=2.4.0=py310h5eee18b_2
  - mkl_fft=1.3.11=py310h5eee18b_0
  - mkl_random=1.2.8=py310h1128e8f_0
  - mpc=1.3.1=h5eee18b_0
  - mpfr=4.2.1=h5eee18b_0
  - mpmath=1.3.0=py310h06a4308_0
  - ncurses=6.4=h6a678d5_0
  - nettle=3.7.3=hbbd107a_1
  - networkx=3.4.2=py310h06a4308_0
  - ocl-icd=2.3.2=h5eee18b_1
  - openh264=2.1.1=h4ff587b_0
  - openjpeg=2.5.2=h0d4d230_1
  - openssl=3.0.16=h5eee18b_0
  - pillow=11.1.0=py310hac6e08b_1
  - pip=25.0=py310h06a4308_0
  - pysocks=1.7.1=py310h06a4308_0
  - python=3.10.16=he870216_1
  - pytorch=2.5.0=py3.10_cuda12.4_cudnn9.1.0_0
  - pytorch-cuda=12.4=hc786d27_7
  - pytorch-mutex=1.0=cuda
  - pyyaml=6.0.2=py310h5eee18b_0
  - readline=8.2=h5eee18b_0
  - requests=2.32.3=py310h06a4308_1
  - sqlite=3.45.3=h5eee18b_0
  - tbb=2021.8.0=hdb19cb5_0
  - tk=8.6.14=h39e8969_0
  - torchaudio=2.5.0=py310_cu124
  - torchvision=0.20.0=py310_cu124
  - typing_extensions=4.12.2=py310h06a4308_0
  - urllib3=2.3.0=py310h06a4308_0
  - wheel=0.45.1=py310h06a4308_0
  - xz=5.6.4=h5eee18b_1
  - yaml=0.2.5=h7b6447c_0
  - zlib=1.2.13=h5eee18b_1
  - zstd=1.5.6=hc292b87_0
  - pip:
    - absl-py==2.2.2
    - accelerate==1.6.0
    - addict==2.4.0
    - aiohappyeyeballs==2.6.1
    - aiohttp==3.11.16
    - aiosignal==1.3.2
    - albumentations==0.5.2
    - antlr4-python3-runtime==4.8
    - asttokens==3.0.0
    - async-timeout==5.0.1
    - attrs==25.3.0
    - av==14.3.0
    - braceexpand==0.1.7
    - cloudpickle==3.1.1
    - colorama==0.4.6
    - coloredlogs==15.0.1
    - contourpy==1.3.2
    - cycler==0.12.1
    - cython==3.0.11
    - decord==0.6.0
    - diffdist==0.1
    - diffusers==0.32.0
    - easydict==1.9
    - einops==0.4.1
    - executing==2.2.0
    - facexlib==0.3.0
    - filterpy==1.4.5
    - flash-attn==2.7.4.post1
    - flatbuffers==25.2.10
    - fonttools==4.57.0
    - frozenlist==1.6.0
    - fsspec==2025.3.2
    - ftfy==6.1.1
    - future==1.0.0
    - gfpgan==1.3.8
    - grpcio==1.71.0
    - h5py==3.7.0
    - huggingface-hub==0.30.2
    - humanfriendly==10.0
    - hydra-core==1.1.0
    - icecream==2.1.2
    - imageio==2.37.0
    - imageio-ffmpeg==0.4.9
    - imgaug==0.4.0
    - importlib-metadata==8.6.1
    - inflect==5.6.0
    - joblib==1.4.2
    - kiwisolver==1.4.8
    - kornia==0.8.0
    - kornia-rs==0.1.8
    - lazy-loader==0.4
    - lightning-utilities==0.14.3
    - llvmlite==0.44.0
    - lmdb==1.6.2
    - loguru==0.7.3
    - markdown==3.8
    - markdown-it-py==3.0.0
    - matplotlib==3.10.1
    - mdurl==0.1.2
    - multidict==6.4.3
    - natten==0.14.4
    - numba==0.61.2
    - numpy==1.24.1
    - nvidia-cublas-cu12==12.4.5.8
    - nvidia-cuda-cupti-cu12==12.4.127
    - nvidia-cuda-nvrtc-cu12==12.4.127
    - nvidia-cuda-runtime-cu12==12.4.127
    - nvidia-cudnn-cu12==9.1.0.70
    - nvidia-cufft-cu12==11.2.1.3
    - nvidia-curand-cu12==10.3.5.147
    - nvidia-cusolver-cu12==11.6.1.9
    - nvidia-cusparse-cu12==12.3.1.170
    - nvidia-cusparselt-cu12==0.6.2
    - nvidia-nccl-cu12==2.21.5
    - nvidia-nvjitlink-cu12==12.4.127
    - nvidia-nvtx-cu12==12.4.127
    - omegaconf==2.1.2
    - onnx==1.17.0
    - onnxruntime-gpu==1.21.1
    - open-clip-torch==2.30.0
    - opencv-python==4.11.0.86
    - opencv-python-headless==4.11.0.86
    - packaging==24.2
    - pandas==2.2.3
    - peft==0.14.0
    - platformdirs==4.3.7
    - plyfile==1.1
    - propcache==0.3.1
    - protobuf==5.29.3
    - psutil==7.0.0
    - py-cpuinfo==9.0.0
    - py360convert==1.0.3
    - pygments==2.19.1
    - pyparsing==3.2.3
    - python-dateutil==2.9.0.post0
    - pytorch-lightning==2.4.0
    - pytz==2025.2
    - qwen-vl-utils==0.0.8
    - regex==2022.6.2
    - rich==14.0.0
    - safetensors==0.5.3
    - scikit-image==0.24.0
    - scikit-learn==1.6.1
    - scipy==1.15.2
    - seaborn==0.13.2
    - segment-anything==1.0
    - sentencepiece==0.2.0
    - setuptools==59.5.0
    - shapely==2.0.7
    - six==1.17.0
    - submitit==1.4.2
    - sympy==1.13.1
    - tabulate==0.9.0
    - tb-nightly==2.20.0a20250421
    - tensorboard-data-server==0.7.2
    - termcolor==3.0.1
    - threadpoolctl==3.6.0
    - tifffile==2025.3.30
    - timm==1.0.13
    - tokenizers==0.21.1
    - tomli==2.2.1
    - torchmetrics==1.7.1
    - tqdm==4.67.1
    - transformers==4.51.0
    - triton==3.2.0
    - tzdata==2025.2
    - ultralytics==8.3.74
    - ultralytics-thop==2.0.14
    - wcwidth==0.2.13
    - webdataset==0.2.100
    - werkzeug==3.1.3
    - wldhx-yadisk-direct==0.0.6
    - xformers==0.0.28.post2
    - yapf==0.43.0
    - yarl==1.20.0
    - zipp==3.21.0
    - open3d>=0.18.0
    - trimesh>=4.6.1
    - cmake
    - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
    - moge @ git+https://github.com/microsoft/MoGe.git
prefix: /opt/conda/envs/HunyuanWorld
docker/HunyuanWorld_mac.yaml
ADDED
@@ -0,0 +1,186 @@
name: HunyuanWorld-mac
channels:
  - pytorch
  - conda-forge
  - defaults
dependencies:
  - python=3.10
  - pytorch
  - torchvision
  - torchaudio
  - ffmpeg
  - filelock
  - freetype
  - gmp
  - gmpy2
  - gnutls
  - idna
  - jinja2
  - jpeg
  - lame
  - lcms2
  - lerc
  - libdeflate
  - libffi
  - libiconv
  - libidn2
  - libpng
  - libtasn1
  - libtiff
  - libunistring
  - libuuid
  - libwebp
  - llvm-openmp
  - lz4-c
  - markupsafe
  - mpc
  - mpfr
  - mpmath
  - ncurses
  - nettle
  - networkx
  - openh264
  - openjpeg
  - openssl
  - pillow
  - pip
  - pysocks
  - pyyaml
  - readline
  - requests
  - sqlite
  - tbb
  - tk
  - typing_extensions
  - urllib3
  - wheel
  - xz
  - yaml
  - zlib
  - zstd
  - pip:
    - absl-py==2.2.2
    - accelerate==1.6.0
    - addict==2.4.0
    - aiohappyeyeballs==2.6.1
    - aiohttp==3.11.16
    - aiosignal==1.3.2
    - albumentations==0.5.2
    - antlr4-python3-runtime==4.8
    - asttokens==3.0.0
    - async-timeout==5.0.1
    - attrs==25.3.0
    - av==14.3.0
    - braceexpand==0.1.7
    - cloudpickle==3.1.1
    - colorama==0.4.6
    - coloredlogs==15.0.1
    - contourpy==1.3.2
    - cycler==0.12.1
    - cython==3.0.11
    - decord==0.6.0
    - diffdist==0.1
    - diffusers==0.32.0
    - easydict==1.9
    - einops==0.4.1
    - executing==2.2.0
    - facexlib==0.3.0
    - filterpy==1.4.5
    - flatbuffers==25.2.10
    - fonttools==4.57.0
    - frozenlist==1.6.0
    - fsspec==2025.3.2
    - ftfy==6.1.1
    - future==1.0.0
    - gfpgan==1.3.8
    - grpcio==1.71.0
    - h5py==3.7.0
    - huggingface-hub==0.30.2
    - humanfriendly==10.0
    - hydra-core==1.1.0
    - icecream==2.1.2
    - imageio==2.37.0
    - imageio-ffmpeg==0.4.9
    - imgaug==0.4.0
    - importlib-metadata==8.6.1
    - inflect==5.6.0
    - joblib==1.4.2
    - kiwisolver==1.4.8
    - kornia==0.8.0
    - kornia-rs==0.1.8
    - lazy-loader==0.4
    - lightning-utilities==0.14.3
    - llvmlite==0.44.0
    - lmdb==1.6.2
    - loguru==0.7.3
    - markdown==3.8
    - markdown-it-py==3.0.0
    - matplotlib==3.10.1
    - mdurl==0.1.2
    - multidict==6.4.3
    - natten==0.14.4
    - numba==0.61.2
    - numpy==1.24.1
    - omegaconf==2.1.2
    - onnx==1.17.0
    - onnxruntime
    - open-clip-torch==2.30.0
    - opencv-python==4.11.0.86
    - opencv-python-headless==4.11.0.86
    - packaging==24.2
    - pandas==2.2.3
    - peft==0.14.0
    - platformdirs==4.3.7
    - plyfile==1.1
    - propcache==0.3.1
    - protobuf==5.29.3
    - psutil==7.0.0
    - py-cpuinfo==9.0.0
    - py360convert==1.0.3
    - pygments==2.19.1
    - pyparsing==3.2.3
    - python-dateutil==2.9.0.post0
    - pytorch-lightning==2.4.0
    - pytz==2025.2
    - qwen-vl-utils==0.0.8
    - regex==2022.6.2
    - rich==14.0.0
    - safetensors==0.5.3
    - scikit-image==0.24.0
    - scikit-learn==1.6.1
    - scipy==1.15.2
    - seaborn==0.13.2
    - segment-anything==1.0
    - sentencepiece==0.2.0
    - setuptools==59.5.0
    - shapely==2.0.7
    - six==1.17.0
    - submitit==1.4.2
    - sympy==1.13.1
    - tabulate==0.9.0
    - tb-nightly==2.20.0a20250421
    - tensorboard-data-server==0.7.2
    - termcolor==3.0.1
    - threadpoolctl==3.6.0
    - tifffile==2025.3.30
    - timm==1.0.13
    - tokenizers==0.21.1
    - tomli==2.2.1
    - torchmetrics==1.7.1
    - tqdm==4.67.1
    - transformers==4.51.0
    - tzdata==2025.2
    - ultralytics==8.3.74
    - ultralytics-thop==2.0.14
    - wcwidth==0.2.13
    - webdataset==0.2.100
    - werkzeug==3.1.3
    - wldhx-yadisk-direct==0.0.6
    - yapf==0.43.0
    - yarl==1.20.0
    - zipp==3.21.0
    - open3d>=0.18.0
    - trimesh>=4.6.1
    - cmake
    - pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git
    - moge @ git+https://github.com/microsoft/MoGe.git
examples/case1/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case1/input.png
ADDED (binary image, stored via Git LFS)
examples/case2/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case2/input.png
ADDED (binary image, stored via Git LFS)
examples/case2/labels_fg1.txt
ADDED
@@ -0,0 +1 @@
stones
examples/case2/labels_fg2.txt
ADDED
@@ -0,0 +1 @@
trees
examples/case3/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case3/input.png
ADDED (binary image, stored via Git LFS)
examples/case4/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case4/prompt.txt
ADDED
@@ -0,0 +1 @@
There is a rocky island on the vast sea surface, with a triangular rock burning red flames in the center of the island. The sea is open and rough, with a green surface. Surrounded by towering peaks in the distance.
examples/case5/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case5/input.png
ADDED (binary image, stored via Git LFS)
examples/case6/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case6/input.png
ADDED (binary image, stored via Git LFS)
examples/case6/labels_fg1.txt
ADDED
@@ -0,0 +1 @@
tent
examples/case7/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case7/prompt.txt
ADDED
@@ -0,0 +1 @@
At the moment of glacier collapse, giant ice walls collapse and create waves, with no wildlife, captured in a disaster documentary
examples/case8/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case8/input.png
ADDED (binary image, stored via Git LFS)
examples/case9/classes.txt
ADDED
@@ -0,0 +1 @@
outdoor
examples/case9/prompt.txt
ADDED
@@ -0,0 +1 @@
A breathtaking volcanic eruption scene. In the center of the screen, one or more volcanoes are erupting violently, with hot orange red lava gushing out from the crater, illuminating the surrounding night sky and landscape. Thick smoke and volcanic ash rose into the sky, forming a huge mushroom cloud like structure. Some of the smoke and dust were reflected in a dark red color by the high temperature of the lava, creating a doomsday atmosphere. In the foreground, a winding lava flow flows through the dark and rough rocks like a fire snake, emitting a dazzling light as if burning the earth. The steep and rugged mountains in the background further emphasize the ferocity and irresistible power of nature. The entire picture has a strong contrast of light and shadow, with red, black, and gray as the main colors, highlighting the visual impact and dramatic tension of volcanic eruptions, making people feel the grandeur and terror of nature.
hy3dworld/__init__.py
ADDED
@@ -0,0 +1,22 @@
# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
# Image to Panorama
from .models import Image2PanoramaPipelines
from .utils import Perspective
# Text to Panorama
from .models import Text2PanoramaPipelines
# Scene Generation
from .models import LayerDecomposition
from .models import WorldComposer
from .utils import process_file
hy3dworld/models/__init__.py
ADDED
@@ -0,0 +1,29 @@
# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].

# Image to Panorama
from .pano_generator import Image2PanoramaPipelines
# Text to Panorama
from .pano_generator import Text2PanoramaPipelines

# Scene Generation
from .pipelines import FluxPipeline, FluxFillPipeline
from .layer_decomposer import LayerDecomposition
from .world_composer import WorldComposer

__all__ = [
    "Image2PanoramaPipelines", "Text2PanoramaPipelines",
    "FluxPipeline", "FluxFillPipeline",
    "LayerDecomposition", "WorldComposer",
]
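
A short sketch of the resulting import surface, mirroring __all__ above (assumes the package is importable as hy3dworld):

# The public names re-exported by hy3dworld.models, per __all__ above.
from hy3dworld.models import (
    Image2PanoramaPipelines,
    Text2PanoramaPipelines,
    FluxPipeline,
    FluxFillPipeline,
    LayerDecomposition,
    WorldComposer,
)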
hy3dworld/models/adaptive_depth_compression.py
ADDED
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
import torch
from typing import List, Dict, Tuple


class AdaptiveDepthCompressor:
    r"""
    Adaptive depth compressor that addresses excessive background depth variance
    in 3D world generation. It compresses background and foreground depth values
    based on statistical analysis of the depth distribution, with options for
    smooth compression and outlier removal.

    Args:
        cv_thresholds: Tuple of (low, high) thresholds for the coefficient of variation (CV).
        compression_quantiles: Tuple of (low, medium, high) quantiles for depth compression.
        fg_bg_depth_margin: Margin factor ensuring background depth stays greater than foreground depth.
        enable_smooth_compression: Whether to use smooth compression instead of hard truncation.
        outlier_removal_method: Outlier removal method: "iqr", "quantile", or "none".
        min_compression_depth: Minimum depth threshold below which no compression is applied.
    """

    def __init__(
        self,
        cv_thresholds: Tuple[float, float] = (0.3, 0.8),
        compression_quantiles: Tuple[float, float, float] = (0.95, 0.92, 0.85),
        fg_bg_depth_margin: float = 1.1,
        enable_smooth_compression: bool = True,
        outlier_removal_method: str = "iqr",
        min_compression_depth: float = 6.0,
    ):
        self.cv_thresholds = cv_thresholds
        self.compression_quantiles = compression_quantiles
        self.fg_bg_depth_margin = fg_bg_depth_margin
        self.enable_smooth_compression = enable_smooth_compression
        self.outlier_removal_method = outlier_removal_method
        self.min_compression_depth = min_compression_depth

    def _remove_outliers(self, depth_vals: torch.Tensor) -> torch.Tensor:
        r"""
        Remove outliers from depth values based on the configured method (IQR or quantile).

        Args:
            depth_vals: Tensor of depth values to process.
        Returns:
            Tensor of depth values with outliers removed.
        """
        if self.outlier_removal_method == "iqr":
            q25, q75 = torch.quantile(depth_vals, torch.tensor(
                [0.25, 0.75], device=depth_vals.device))
            iqr = q75 - q25
            lower_bound, upper_bound = q25 - 1.5 * iqr, q75 + 1.5 * iqr
            valid_mask = (depth_vals >= lower_bound) & (
                depth_vals <= upper_bound)
        elif self.outlier_removal_method == "quantile":
            q05, q95 = torch.quantile(depth_vals, torch.tensor(
                [0.05, 0.95], device=depth_vals.device))
            valid_mask = (depth_vals >= q05) & (depth_vals <= q95)
        else:
            return depth_vals
        return depth_vals[valid_mask] if valid_mask.sum() > 0 else depth_vals

    def _collect_foreground_depths(
        self,
        layered_world_depth: List[Dict]
    ) -> List[torch.Tensor]:
        r"""
        Collect depth values of all foreground layers (with outliers removed)
        from the layered world depth representation.

        Args:
            layered_world_depth: List of dictionaries containing depth information for each layer.
        Returns:
            List of tensors containing cleaned foreground depth values.
        """
        fg_depths = []
        for layer_depth in layered_world_depth:
            if layer_depth["name"] == "background":
                continue

            depth_vals = layer_depth["distance"]
            mask = layer_depth.get("mask", None)

            # Process the depth values within the mask area
            if mask is not None:
                if not isinstance(mask, torch.Tensor):
                    mask = torch.from_numpy(mask).to(depth_vals.device)
                depth_vals = depth_vals[mask.bool()]

            if depth_vals.numel() > 0:
                cleaned_depths = self._remove_outliers(depth_vals)
                if len(cleaned_depths) > 0:
                    fg_depths.append(cleaned_depths)
        return fg_depths

    def _get_pixelwise_foreground_max_depth(
        self,
        layered_world_depth: List[Dict],
        bg_shape: torch.Size,
        bg_device: torch.device
    ) -> torch.Tensor:
        r"""
        Calculate the maximum foreground depth at each pixel position.

        Args:
            layered_world_depth: List of dictionaries containing depth information for each layer.
            bg_shape: Shape of the background depth tensor.
            bg_device: Device where the background depth tensor is located.
        Returns:
            Tensor of maximum foreground depth values for each pixel position.
        """
        fg_max_depth = torch.zeros(bg_shape, device=bg_device)

        for layer_depth in layered_world_depth:
            if layer_depth["name"] == "background":
                continue

            layer_distance = layer_depth["distance"]
            layer_mask = layer_depth.get("mask", None)

            # Ensure that the tensor is on the correct device
            if not isinstance(layer_distance, torch.Tensor):
                layer_distance = torch.from_numpy(layer_distance).to(bg_device)
            else:
                layer_distance = layer_distance.to(bg_device)

            # Update the maximum foreground depth
            if layer_mask is not None:
                if not isinstance(layer_mask, torch.Tensor):
                    layer_mask = torch.from_numpy(layer_mask).to(bg_device)
                else:
                    layer_mask = layer_mask.to(bg_device)
                fg_max_depth = torch.where(layer_mask.bool(), torch.max(
                    fg_max_depth, layer_distance), fg_max_depth)
            else:
                fg_max_depth = torch.max(fg_max_depth, layer_distance)

        return fg_max_depth

    def _analyze_depth_distribution(self, bg_depth_distance: torch.Tensor) -> Dict:
        r"""
        Analyze the distribution characteristics of the background depth.

        Args:
            bg_depth_distance: Tensor of background depth distances.
        Returns:
            Dictionary containing statistical properties of the background depth distribution.
        """
        bg_mean, bg_std = torch.mean(
            bg_depth_distance), torch.std(bg_depth_distance)
        cv = bg_std / bg_mean

        quantiles = torch.quantile(bg_depth_distance, torch.tensor(
            [0.5, 0.75, 0.9, 0.95, 0.99], device=bg_depth_distance.device))
        bg_q50, bg_q75, bg_q90, bg_q95, bg_q99 = quantiles

        return {"mean": bg_mean, "std": bg_std, "cv": cv, "q50": bg_q50,
                "q75": bg_q75, "q90": bg_q90, "q95": bg_q95, "q99": bg_q99}

    def _determine_compression_strategy(self, cv: float) -> Tuple[str, float]:
        r"""
        Determine the compression strategy based on the coefficient of variation.

        Args:
            cv: Coefficient of variation of the background depth distribution.
        Returns:
            Tuple containing the compression strategy ("gentle", "standard", "aggressive")
            and the quantile to use for compression.
        """
        low_cv_threshold, high_cv_threshold = self.cv_thresholds
        low_var_quantile, medium_var_quantile, high_var_quantile = self.compression_quantiles

        if cv < low_cv_threshold:
            return "gentle", low_var_quantile
        elif cv > high_cv_threshold:
            return "aggressive", high_var_quantile
        else:
            return "standard", medium_var_quantile

    def _smooth_compression(self, depth_values: torch.Tensor, max_depth: torch.Tensor,
                            mask: torch.Tensor = None, transition_start_ratio: float = 0.95,
                            transition_range_ratio: float = 0.2, verbose: bool = False) -> torch.Tensor:
        r"""
        Apply a smooth compression function instead of hard truncation.

        Args:
            depth_values: Tensor of depth values to compress.
            max_depth: Maximum depth value for compression.
            mask: Optional mask to apply compression only to certain pixels.
            transition_start_ratio: Ratio determining where the transition range starts.
            transition_range_ratio: Ratio determining the extent of the transition range.
            verbose: Whether to print detailed information about the compression process.
        Returns:
            Compressed depth values as a tensor.
        """
        if not self.enable_smooth_compression:
            compressed = depth_values.clone()
            if mask is not None:
                compressed[mask] = torch.clamp(
                    depth_values[mask], max=max_depth)
            else:
                compressed = torch.clamp(depth_values, max=max_depth)
            return compressed

        transition_start = max_depth * transition_start_ratio
        transition_range = max_depth * transition_range_ratio
        compressed_depth = depth_values.clone()

        mask_far = depth_values > transition_start
        if mask is not None:
            mask_far = mask_far & mask

        if mask_far.sum() > 0:
            far_depths = depth_values[mask_far]
            normalized = (far_depths - transition_start) / transition_range
            compressed_normalized = torch.sigmoid(
                normalized * 2 - 1) * 0.5 + 0.5
            compressed_far = transition_start + \
                compressed_normalized * (max_depth - transition_start)
            compressed_depth[mask_far] = compressed_far
            if verbose:
                print(
                    f"\t Applied smooth compression to {mask_far.sum()} pixels beyond {transition_start:.2f}")
        elif verbose:
            print("\t No compression needed, all depths within reasonable range")

        return compressed_depth

    def compress_background_depth(self, bg_depth_distance: torch.Tensor, layered_world_depth: List[Dict],
                                  bg_mask: torch.Tensor, verbose: bool = False) -> torch.Tensor:
        r"""
        Adaptively compress background depth values.

        Args:
            bg_depth_distance: Tensor of background depth distances.
            layered_world_depth: List of dictionaries containing depth information for each layer.
            bg_mask: Tensor or numpy array representing the mask of the background region.
            verbose: Whether to print detailed information about the compression process.
        Returns:
            Compressed background depth values as a tensor.
        """
        if verbose:
            print("\t - Applying adaptive depth compression...")

        # Process mask
        if not isinstance(bg_mask, torch.Tensor):
            bg_mask = torch.from_numpy(bg_mask).to(bg_depth_distance.device)
        mask_bool = bg_mask.bool()
        masked_depths = bg_depth_distance[mask_bool]

        if masked_depths.numel() == 0:
            if verbose:
                print("\t No valid depths in mask region, skipping compression")
            return bg_depth_distance

        # 1. Collect foreground depth information
        fg_depths = self._collect_foreground_depths(layered_world_depth)

        # 2. Compute foreground depth statistics
        if fg_depths:
            all_fg_depths = torch.cat(fg_depths)
            fg_max = torch.quantile(all_fg_depths, torch.tensor(
                0.99, device=all_fg_depths.device))
            if verbose:
                print(
                    f"\t Foreground depth stats - 99th percentile: {fg_max:.2f}")
        else:
            fg_max = torch.quantile(masked_depths, torch.tensor(
                0.5, device=masked_depths.device))
            if verbose:
                print("\t No foreground found, using background stats for reference")

        # 3. Analyze the depth distribution of the background
        depth_stats = self._analyze_depth_distribution(masked_depths)
        if verbose:
            print(
                f"\t Background depth stats - mean: {depth_stats['mean']:.2f}, "
                f"std: {depth_stats['std']:.2f}, CV: {depth_stats['cv']:.3f}")

        # 4. Determine the compression strategy
        strategy, compression_quantile = self._determine_compression_strategy(
            depth_stats['cv'])
        max_depth = torch.quantile(masked_depths, torch.tensor(
            compression_quantile, device=masked_depths.device))

        if verbose:
            print(
                f"\t {strategy.capitalize()} compression strategy "
                f"(CV={depth_stats['cv']:.3f}), quantile={compression_quantile}")

        # 5. Pixel-level depth constraint
        if fg_depths:
            fg_max_depth_pixelwise = self._get_pixelwise_foreground_max_depth(
                layered_world_depth, bg_depth_distance.shape, bg_depth_distance.device)
            required_min_bg_depth = fg_max_depth_pixelwise * self.fg_bg_depth_margin
            pixelwise_violations = (
                bg_depth_distance < required_min_bg_depth) & mask_bool

            if pixelwise_violations.sum() > 0:
                violation_ratio = pixelwise_violations.float().sum() / mask_bool.float().sum()
                violated_required_depths = required_min_bg_depth[pixelwise_violations]
                pixelwise_min_depth = torch.quantile(violated_required_depths, torch.tensor(
                    0.99, device=violated_required_depths.device))
                max_depth = torch.max(max_depth, pixelwise_min_depth)
                if verbose:
                    print(
                        f"\t Pixelwise constraint violation: {violation_ratio:.1%}, "
                        f"adjusted max depth to {max_depth:.2f}")
            elif verbose:
                print("\t Pixelwise depth constraints satisfied")

        # 6. Global statistical constraint
        if fg_depths:
            min_bg_depth = fg_max * self.fg_bg_depth_margin
            max_depth = torch.max(max_depth, min_bg_depth)
            if verbose:
                print(f"\t Final max depth: {max_depth:.2f}")

        # 6.5. Depth threshold check: if max_depth is below the threshold, skip compression
        if max_depth < self.min_compression_depth:
            if verbose:
                print(
                    f"\t Max depth {max_depth:.2f} is below threshold "
                    f"{self.min_compression_depth:.2f}, skipping compression")
            return bg_depth_distance

        # 7. Apply compression
        compressed_depth = self._smooth_compression(
            bg_depth_distance, max_depth, mask_bool, 0.9, 0.2, verbose)

        # 8. Hard truncation of extreme outliers
        final_max = max_depth * 1.2
        outliers = (compressed_depth > final_max) & mask_bool
        if outliers.sum() > 0:
            compressed_depth[outliers] = final_max

        # 9. Statistics
        compression_ratio = ((bg_depth_distance > max_depth)
                             & mask_bool).float().sum() / mask_bool.float().sum()
        if verbose:
            print(
                f"\t Compression summary - max depth: "
                f"{max_depth:.2f}, affected: {compression_ratio:.1%}")

        return compressed_depth

    def compress_foreground_depth(
        self,
        fg_depth_distance: torch.Tensor,
        fg_mask: torch.Tensor,
        verbose: bool = False,
        conservative_ratio: float = 0.99,
        iqr_scale: float = 2
    ) -> torch.Tensor:
        r"""
        Conservatively compress outliers in foreground depth.

        Args:
            fg_depth_distance: Tensor of foreground depth distances.
            fg_mask: Tensor or numpy array representing the mask of the foreground region.
            verbose: Whether to print detailed information about the compression process.
            conservative_ratio: Quantile used for the conservative upper bound.
            iqr_scale: Scale factor for the IQR-based upper bound.
        Returns:
            Compressed foreground depth values as a tensor.
        """
        if verbose:
            print("\t - Applying conservative foreground depth compression...")

        # Process mask
        if not isinstance(fg_mask, torch.Tensor):
            fg_mask = torch.from_numpy(fg_mask).to(fg_depth_distance.device)
        mask_bool = fg_mask.bool()
        masked_depths = fg_depth_distance[mask_bool]

        if masked_depths.numel() == 0:
            if verbose:
                print("\t No valid depths in mask region, skipping compression")
            return fg_depth_distance

        # Calculate statistical information
        depth_mean, depth_std = torch.mean(
            masked_depths), torch.std(masked_depths)

        # Determine the upper bound using IQR and quantile methods
        q25, q75 = torch.quantile(masked_depths, torch.tensor(
            [0.25, 0.75], device=masked_depths.device))
        iqr = q75 - q25
        upper_bound = q75 + iqr_scale * iqr
        conservative_max = torch.quantile(masked_depths, torch.tensor(
            conservative_ratio, device=masked_depths.device))
        final_max = torch.max(upper_bound, conservative_max)

        # Count statistical outliers
        outliers = (fg_depth_distance > final_max) & mask_bool
        outlier_count = outliers.sum().item()

        if verbose:
            print(
                f"\t Depth stats - mean: {depth_mean:.2f}, std: {depth_std:.2f}")
            print(
                f"\t IQR bounds - Q25: {q25:.2f}, Q75: {q75:.2f}, upper: {upper_bound:.2f}")
            print(
                f"\t Conservative max: {conservative_max:.2f}, final max: {final_max:.2f}")
            print(
                f"\t Outliers: {outlier_count} ({(outlier_count / masked_depths.numel() * 100):.2f}%)")

        # Depth threshold check: if final_max is below the threshold, skip compression
        if final_max < self.min_compression_depth:
            if verbose:
                print(
                    f"\t Final max depth {final_max:.2f} is below threshold "
                    f"{self.min_compression_depth:.2f}, skipping compression")
            return fg_depth_distance

        # Apply compression
        if outlier_count > 0:
            compressed_depth = self._smooth_compression(
                fg_depth_distance, final_max, mask_bool, 0.99, 0.1, verbose)
        else:
            compressed_depth = fg_depth_distance.clone()

        return compressed_depth


def create_adaptive_depth_compressor(
    scene_type: str = "auto",
    enable_smooth_compression: bool = True,
    outlier_removal_method: str = "iqr",
    min_compression_depth: float = 6.0,  # minimum compression depth threshold
) -> AdaptiveDepthCompressor:
    r"""
    Create an adaptive depth compressor suited to different scene types.

    Args:
        scene_type: Scene type ("indoor", "outdoor", "mixed", "auto").
        enable_smooth_compression: Whether to enable smooth compression.
        outlier_removal_method: Outlier removal method ("iqr", "quantile", "none").
        min_compression_depth: Minimum depth threshold below which no compression is applied.
    """
    common_params = {
        "enable_smooth_compression": enable_smooth_compression,
        "outlier_removal_method": outlier_removal_method,
        "min_compression_depth": min_compression_depth,
    }

    if scene_type == "indoor":
        # Indoor scenes: depth variation is relatively small, use conservative compression
        return AdaptiveDepthCompressor(
            cv_thresholds=(0.2, 0.6),
            compression_quantiles=(1.0, 0.975, 0.95),
            fg_bg_depth_margin=1.05,
            **common_params
        )
    elif scene_type == "outdoor":
        # Outdoor scenes: sky, distant mountains, etc. may be present, use more aggressive compression
        return AdaptiveDepthCompressor(
            cv_thresholds=(0.4, 1.0),
            compression_quantiles=(0.98, 0.955, 0.93),
            fg_bg_depth_margin=1.15,
            **common_params
        )
    elif scene_type == "mixed":
        # Mixed scenes: balanced settings
        return AdaptiveDepthCompressor(
            cv_thresholds=(0.3, 0.8),
            compression_quantiles=(0.99, 0.97, 0.95),
            fg_bg_depth_margin=1.1,
            **common_params
        )
    else:  # auto
        # Automatic mode: use default settings
        return AdaptiveDepthCompressor(**common_params)
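A minimal usage sketch for the compressor above. The `layered_world_depth` entry keys ("name", "distance", optional "mask") follow the conventions the methods above read; the shapes and depth values here are illustrative only:

    import torch
    from hy3dworld.models.adaptive_depth_compression import create_adaptive_depth_compressor

    # Toy layered scene: one foreground layer over an H x W background depth map
    H, W = 4, 8
    layers = [{"name": "fg1",
               "distance": torch.full((H, W), 3.0),
               "mask": torch.ones(H, W)}]
    bg_depth = torch.full((H, W), 50.0)
    bg_mask = torch.ones(H, W)

    compressor = create_adaptive_depth_compressor(scene_type="outdoor")
    bg_depth = compressor.compress_background_depth(bg_depth, layers, bg_mask, verbose=True)

`compress_foreground_depth` takes the analogous (depth, mask) pair for a single foreground layer.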
hy3dworld/models/layer_decomposer.py
ADDED
@@ -0,0 +1,155 @@
import os
import json
import torch
from ..utils import sr_utils, seg_utils, inpaint_utils, layer_utils


class LayerDecomposition:
    r"""LayerDecomposition is responsible for generating layers in a scene based on input images and masks.
    It processes foreground objects, background layers, and sky regions using various models.

    Attributes:
        seed (int): Random seed for reproducibility.
        strength (float): Strength of the layer generation.
        threshold (int): Threshold for object detection.
        ratio (float): Ratio for scaling objects.
        grounding_model (str): Path to the grounding model for object detection.
        zim_model_config (str): Configuration for the ZIM model.
        zim_checkpoint (str): Path to the ZIM model checkpoint.
        inpaint_model (str): Path to the inpainting model.
        inpaint_fg_lora (str): Path to the LoRA weights for foreground inpainting.
        inpaint_sky_lora (str): Path to the LoRA weights for sky inpainting.
        scale (int): Scale factor for super-resolution.
        device (str): Device to run the model on, either "cuda" or "cpu".
        dilation_size (int): Size of the dilation for mask processing.
        cfg_scale (float): Classifier-free guidance scale for the model.
        prompt_config (dict): Configuration for prompts used in the model.
    """
    def __init__(self):
        r"""Initialize the LayerDecomposition class with model paths and parameters."""
        self.seed = 25
        self.strength = 1.0
        self.threshold = 20000
        self.ratio = 1.5
        self.grounding_model = "IDEA-Research/grounding-dino-tiny"
        self.zim_model_config = "vit_l"
        self.zim_checkpoint = "./ZIM/zim_vit_l_2092"  # Add ZIM-Anything checkpoint here
        self.inpaint_model = "black-forest-labs/FLUX.1-Fill-dev"
        self.inpaint_fg_lora = "tencent/HunyuanWorld-1"
        self.inpaint_sky_lora = "tencent/HunyuanWorld-1"
        self.scale = 2
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.dilation_size = 80
        self.cfg_scale = 5.0
        self.prompt_config = {
            "indoor": {
                "positive_prompt": "",
                "negative_prompt": (
                    "object, table, chair, seat, shelf, sofa, bed, bath, sink,"
                    "ceramic, wood, plant, tree, light, lamp, candle, television, electronics,"
                    "oven, fire, low-resolution, blur, mosaic, people")
            },
            "outdoor": {
                "positive_prompt": "",
                "negative_prompt": (
                    "object, chair, tree, plant, flower, grass, stone, rock,"
                    "building, hill, house, tower, light, lamp, low-resolution, blur, mosaic, people")
            }
        }

        # Load models
        print("============= now loading models ===============")
        # super-resolution model
        self.sr_model = sr_utils.build_sr_model(scale=self.scale, gpu_id=0)
        print("============= load Super-Resolution models done ")
        # segmentation models
        self.zim_predictor = seg_utils.build_zim_model(
            self.zim_model_config, self.zim_checkpoint, device='cuda:0')
        self.gd_processor, self.gd_model = seg_utils.build_gd_model(
            self.grounding_model, device='cuda:0')
        print("============= load Segmentation models done ====")
        # panorama inpainting models
        self.inpaint_fg_model = inpaint_utils.build_inpaint_model(
            self.inpaint_model,
            self.inpaint_fg_lora,
            subfolder="HunyuanWorld-PanoInpaint-Scene",
            device=0
        )
        self.inpaint_sky_model = inpaint_utils.build_inpaint_model(
            self.inpaint_model,
            self.inpaint_sky_lora,
            subfolder="HunyuanWorld-PanoInpaint-Sky",
            device=0
        )
        print("============= load panorama inpaint models done =")

    def __call__(self, input, layer):
        r"""Generate layers based on the input images and masks.

        Args:
            input (str or list): Path to the input JSON file or a list of image information.
            layer (int): Layer index to process (0 for foreground1, 1 for foreground2, 2 for sky).
        Raises:
            FileNotFoundError: If the input file does not exist.
            ValueError: If the input file is not a JSON file or if the layer index is invalid.
            TypeError: If the input is neither a string nor a list.
        """
        # Enable bfloat16 autocast for the whole call
        torch.autocast(device_type=self.device,
                       dtype=torch.bfloat16).__enter__()

        # Input handling and validation
        if isinstance(input, str):
            if not os.path.exists(input):
                raise FileNotFoundError(f"Input file {input} does not exist.")
            if not input.endswith('.json'):
                raise ValueError("Input file must be a JSON file.")
            with open(input, "r") as f:
                img_infos = json.load(f)
                img_infos = img_infos["output"]
        elif isinstance(input, list):
            img_infos = input
        else:
            raise TypeError("Input must be a JSON file path or a list.")

        # Processing parameters
        params = {
            'scale': self.scale,
            'seed': self.seed,
            'threshold': self.threshold,
            'ratio': self.ratio,
            'strength': self.strength,
            'dilation_size': self.dilation_size,
            'cfg_scale': self.cfg_scale,
            'prompt_config': self.prompt_config
        }

        # Layer-specific processing pipelines
        if layer == 0:
            layer_utils.remove_fg1_pipeline(
                img_infos=img_infos,
                sr_model=self.sr_model,
                zim_predictor=self.zim_predictor,
                gd_processor=self.gd_processor,
                gd_model=self.gd_model,
                inpaint_model=self.inpaint_fg_model,
                params=params
            )
        elif layer == 1:
            layer_utils.remove_fg2_pipeline(
                img_infos=img_infos,
                sr_model=self.sr_model,
                zim_predictor=self.zim_predictor,
                gd_processor=self.gd_processor,
                gd_model=self.gd_model,
                inpaint_model=self.inpaint_fg_model,
                params=params
            )
        else:
            layer_utils.sky_pipeline(
                img_infos=img_infos,
                sr_model=self.sr_model,
                zim_predictor=self.zim_predictor,
                gd_processor=self.gd_processor,
                gd_model=self.gd_model,
                inpaint_model=self.inpaint_sky_model,
                params=params
            )
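A minimal driver sketch for the decomposer above. Construction is heavyweight (it loads the super-resolution, segmentation, and inpainting models listed in `__init__`, so the referenced checkpoints and a CUDA device must be available), and the JSON path here is hypothetical; the expected schema is whatever `layer_utils` consumes:

    from hy3dworld.models import LayerDecomposition

    decomposer = LayerDecomposition()
    for layer_id in (0, 1, 2):  # 0: foreground1, 1: foreground2, 2: sky
        decomposer("path/to/image_info.json", layer_id)  # hypothetical input file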
hy3dworld/models/pano_generator.py
ADDED
@@ -0,0 +1,236 @@
# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].

import torch
from transformers import (
    CLIPTextModel,
    CLIPTokenizer,
    T5EncoderModel,
    T5TokenizerFast,
)

from diffusers.image_processor import VaeImageProcessor
from diffusers.models.autoencoders import AutoencoderKL

from diffusers.models.transformers import FluxTransformer2DModel
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler

from diffusers.utils.torch_utils import randn_tensor

from .pipelines import FluxPipeline, FluxFillPipeline


class Text2PanoramaPipelines(FluxPipeline):
    @torch.no_grad()
    def __call__(self, prompt, **kwargs):
        """Main text-to-panorama call."""
        return self._call_shared(prompt=prompt, is_inpainting=False, early_steps=3, **kwargs)


class Image2PanoramaPipelines(FluxFillPipeline):
    def __init__(
        self,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        text_encoder_2: T5EncoderModel,
        tokenizer_2: T5TokenizerFast,
        transformer: FluxTransformer2DModel,
    ):
        # Initialization from FluxFillPipeline
        super().__init__(
            scheduler=scheduler,
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            text_encoder_2=text_encoder_2,
            tokenizer_2=tokenizer_2,
            transformer=transformer,
        )

        # Override parts of the base initialization
        self.latent_channels = self.vae.config.latent_channels if getattr(
            self, "vae", None) else 16

        self.mask_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_scale_factor * 2,
            vae_latent_channels=self.latent_channels,
            do_normalize=False,
            do_binarize=True,
            do_convert_grayscale=True,
        )

    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(num_inference_steps *
                            strength, num_inference_steps)

        t_start = int(max(num_inference_steps - init_timestep, 0))
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
        if hasattr(self.scheduler, "set_begin_index"):
            self.scheduler.set_begin_index(t_start * self.scheduler.order)

        return timesteps, num_inference_steps - t_start

    def prepare_inpainting_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
        image=None,
        is_strength_max=True,
        timestep=None,
    ):
        r"""
        Prepare the latents for Image2PanoramaPipelines.
        """
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        # VAE applies 8x compression on images but we must also account for packing which requires
        # latent height and width to be divisible by 2.
        height = 2 * (int(height) // (self.vae_scale_factor * 2))
        width = 2 * (int(width) // (self.vae_scale_factor * 2))
        shape = (batch_size, num_channels_latents, height, width)

        # Return the latents if they are already provided,
        # building the latent image ids they must be paired with
        if latents is not None:
            latent_image_ids = self._prepare_latent_image_ids(
                batch_size, height // 2, width // 2, device, dtype)
            return latents.to(device=device, dtype=dtype), latent_image_ids

        # If no latents are provided, we need to encode the image
        image = image.to(device=device, dtype=dtype)
        if image.shape[1] != self.latent_channels:
            image_latents = self._encode_vae_image(
                image=image, generator=generator)
        else:
            image_latents = image

        # Ensure image_latents has the correct shape
        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
            # expand init_latents for batch_size
            additional_image_per_prompt = batch_size // image_latents.shape[0]
            image_latents = torch.cat(
                [image_latents] * additional_image_per_prompt, dim=0)
        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            image_latents = torch.cat([image_latents], dim=0)
        # Add noise to the latents
        noise = randn_tensor(shape, generator=generator,
                             device=device, dtype=dtype)
        latents = self.scheduler.scale_noise(image_latents, timestep, noise)

        # Prepare blended latents by wrapping the left edge around to the right
        latents = torch.cat(
            [latents, latents[:, :, :, :self.blend_extend]], dim=-1)
        width_new_blended = latents.shape[-1]
        latents = self._pack_latents(
            latents, batch_size, num_channels_latents, height, width_new_blended)
        # prepare latent image ids
        latent_image_ids = self._prepare_latent_image_ids(
            batch_size, height // 2, width_new_blended // 2, device, dtype)

        return latents, latent_image_ids, width_new_blended

    def prepare_blending_latent(
        self, latents, height, width, batch_size, num_channels_latents, width_new_blended=None
    ):
        return latents, width_new_blended

    def _apply_blending(
        self,
        latents: torch.Tensor,
        height: int,
        width_new_blended: int,
        num_channels_latents: int,
        batch_size: int,
        **kwargs,
    ) -> torch.Tensor:
        r"""Apply horizontal blending to latents."""
        # Unpack latents for processing
        latents_unpack = self._unpack_latents(
            latents, height, width_new_blended * self.vae_scale_factor, self.vae_scale_factor
        )
        # Apply blending
        latents_unpack = self.blend_h(
            latents_unpack, latents_unpack, self.blend_extend)

        latent_height = 2 * \
            (int(height) // (self.vae_scale_factor * 2))

        shifting_extend = kwargs.get("shifting_extend", None)
        if shifting_extend is None:
            shifting_extend = latents_unpack.size()[-1] // 4

        latents_unpack = torch.roll(
            latents_unpack, shifting_extend, -1)

        # Repack latents after blending
        latents = self._pack_latents(
            latents_unpack, batch_size, num_channels_latents, latent_height, width_new_blended)
        return latents

    def _apply_blending_mask(
        self,
        latents: torch.Tensor,
        height: int,
        width_new_blended: int,
        num_channels_latents: int,
        batch_size: int,
        **kwargs
    ) -> torch.Tensor:
        r"""Apply horizontal blending to mask latents (which carry 80 channels)."""
        return self._apply_blending(
            latents, height, width_new_blended, 80, batch_size, **kwargs
        )

    def _final_process_latents(
        self,
        latents: torch.Tensor,
        height: int,
        width_new_blended: int,
        width: int
    ) -> torch.Tensor:
        """Final processing of latents before decoding."""
        # Unpack and crop to target width
        latents_unpack = self._unpack_latents(
            latents, height, width_new_blended * self.vae_scale_factor, self.vae_scale_factor
        )
        latents_unpack = self.blend_h(
            latents_unpack, latents_unpack, self.blend_extend
        )
        latents_unpack = latents_unpack[:, :, :, :width // self.vae_scale_factor]

        # Repack for final output
        return self._pack_latents(
            latents_unpack,
            latents.shape[0],  # batch size
            latents.shape[2] // 4,  # num_channels_latents
            height // self.vae_scale_factor,
            width // self.vae_scale_factor
        )

    @torch.no_grad()
    def __call__(self, **kwargs):
        """Main inpainting call."""
        return self._call_shared(is_inpainting=True, early_steps=3, blend_extra_chanel=True, **kwargs)
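A loading sketch for the pipelines above, assuming they keep the `from_pretrained` loaders of their FluxPipeline/FluxFillPipeline bases. The base checkpoint, the `blend_extend` value, and the call arguments are assumptions here (the call forwards to `_call_shared`, whose signature lives in `.pipelines`; the repo's actual drivers are demo_panogen.py and app.py):

    import torch
    from PIL import Image
    from hy3dworld.models import Image2PanoramaPipelines

    pipe = Image2PanoramaPipelines.from_pretrained(
        "black-forest-labs/FLUX.1-Fill-dev",  # same base weights named in layer_decomposer.py
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    pipe.blend_extend = 6  # hypothetical wrap-around width, in latent columns

    input_image = Image.open("examples/case1/input.png")
    mask_image = Image.new("L", input_image.size, 255)  # illustrative full-frame mask
    # Argument names assumed from the standard diffusers Fill interface; 2:1 suits panoramas.
    result = pipe(prompt="", image=input_image, mask_image=mask_image,
                  height=960, width=1920)

`Text2PanoramaPipelines` would be loaded the same way from a FLUX.1-dev-style base and called with a text prompt only.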