infinity1096 committed on
Commit c8b42eb · 1 Parent(s): 3991736

initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +152 -0
  3. LICENSE.txt +58 -0
  4. UniCeption/.gitignore +167 -0
  5. UniCeption/.pre-commit-config.yaml +18 -0
  6. UniCeption/.pylintrc +399 -0
  7. UniCeption/LICENSE +28 -0
  8. UniCeption/README.md +155 -0
  9. UniCeption/examples/models/cosmos/autoencoding.py +48 -0
  10. UniCeption/examples/models/cosmos/example.png +3 -0
  11. UniCeption/examples/models/cosmos/example_decoded.png +3 -0
  12. UniCeption/examples/models/dust3r/convert_dust3r_weights_to_uniception.py +331 -0
  13. UniCeption/examples/models/dust3r/dust3r.py +261 -0
  14. UniCeption/examples/models/dust3r/profile_dust3r.py +47 -0
  15. UniCeption/pyproject.toml +21 -0
  16. UniCeption/scripts/check_dependencies.py +49 -0
  17. UniCeption/scripts/download_checkpoints.py +48 -0
  18. UniCeption/scripts/install_croco_rope.py +62 -0
  19. UniCeption/scripts/prepare_offline_install.py +399 -0
  20. UniCeption/scripts/validate_installation.py +213 -0
  21. UniCeption/setup.py +188 -0
  22. UniCeption/tests/models/encoders/conftest.py +26 -0
  23. UniCeption/tests/models/encoders/test_encoders.py +204 -0
  24. UniCeption/tests/models/encoders/viz_image_encoders.py +294 -0
  25. UniCeption/tests/models/info_sharing/viz_mulit_view_cross_attn_transformers.py +337 -0
  26. UniCeption/uniception/__init__.py +0 -0
  27. UniCeption/uniception/models/encoders/README.md +129 -0
  28. UniCeption/uniception/models/encoders/__init__.py +235 -0
  29. UniCeption/uniception/models/encoders/base.py +157 -0
  30. UniCeption/uniception/models/encoders/cosmos.py +137 -0
  31. UniCeption/uniception/models/encoders/croco.py +457 -0
  32. UniCeption/uniception/models/encoders/dense_rep_encoder.py +344 -0
  33. UniCeption/uniception/models/encoders/dinov2.py +333 -0
  34. UniCeption/uniception/models/encoders/global_rep_encoder.py +115 -0
  35. UniCeption/uniception/models/encoders/image_normalizations.py +35 -0
  36. UniCeption/uniception/models/encoders/list.py +10 -0
  37. UniCeption/uniception/models/encoders/naradio.py +502 -0
  38. UniCeption/uniception/models/encoders/patch_embedder.py +235 -0
  39. UniCeption/uniception/models/encoders/radio.py +367 -0
  40. UniCeption/uniception/models/encoders/utils.py +86 -0
  41. UniCeption/uniception/models/factory/__init__.py +3 -0
  42. UniCeption/uniception/models/factory/dust3r.py +332 -0
  43. UniCeption/uniception/models/info_sharing/README.md +18 -0
  44. UniCeption/uniception/models/info_sharing/__init__.py +35 -0
  45. UniCeption/uniception/models/info_sharing/alternating_attention_transformer.py +944 -0
  46. UniCeption/uniception/models/info_sharing/base.py +116 -0
  47. UniCeption/uniception/models/info_sharing/cross_attention_transformer.py +582 -0
  48. UniCeption/uniception/models/info_sharing/diff_cross_attention_transformer.py +588 -0
  49. UniCeption/uniception/models/info_sharing/global_attention_transformer.py +1107 -0
  50. UniCeption/uniception/models/libs/__init__.py +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
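The one-line change above registers PNG files with Git LFS, which is why the example images added later in this commit appear as LFS pointers rather than raw image data. As an illustration only (not part of the commit itself), an equivalent attribute line is normally produced with the standard `git lfs track` command:

```bash
# Illustration: `git lfs track` appends the matching filter/diff/merge
# attributes for *.png to .gitattributes
git lfs track "*.png"
git add .gitattributes
```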
.gitignore ADDED
@@ -0,0 +1,152 @@
+ uniflowmatch.egg-info/**
+ ufm_model_refine/**
+ ufm_model/**
+ /home/inf/UniFlowMatch/convert_old_ckpt.py
+ checkpoints/**
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ **/__pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Profiling data
+ .prof
+
+ # Folder specific to your needs
+ **/tmp/
+ **/outputs/skyseg.onnx
+ skyseg.onnx
+
+ # pixi environments
+ .pixi
+ *.egg-info
LICENSE.txt ADDED
@@ -0,0 +1,58 @@
1
+ Attribution-NonCommercial 2.5 Generic
2
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.
3
+ License
4
+
5
+ THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
6
+
7
+ BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
8
+
9
+ 1. Definitions
10
+
11
+ "Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.
12
+ "Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered a Derivative Work for the purpose of this License.
13
+ "Licensor" means the individual or entity that offers the Work under the terms of this License.
14
+ "Original Author" means the individual or entity who created the Work.
15
+ "Work" means the copyrightable work of authorship offered under the terms of this License.
16
+ "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.
17
+ 2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.
18
+
19
+ 3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:
20
+
21
+ to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;
22
+ to create and reproduce Derivative Works;
23
+ to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;
24
+ to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works;
25
+ The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved, including but not limited to the rights set forth in Sections 4(d) and 4(e).
26
+
27
+ 4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:
28
+
29
+ You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any credit as required by clause 4(c), as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any credit as required by clause 4(c), as requested.
30
+ You may not exercise any of the rights granted to You in Section 3 above in any manner that is primarily intended for or directed toward commercial advantage or private monetary compensation. The exchange of the Work for other copyrighted works by means of digital file-sharing or otherwise shall not be considered to be intended for or directed toward commercial advantage or private monetary compensation, provided there is no payment of any monetary compensation in connection with the exchange of copyrighted works.
31
+ If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of Original Author (or pseudonym, if applicable) if supplied, and/or (ii) if the Original Author and/or Licensor designate another party or parties (e.g. a sponsor institute, publishing entity, journal) for attribution in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.
32
+ For the avoidance of doubt, where the Work is a musical composition:
33
+
34
+ Performance Royalties Under Blanket Licenses . Licensor reserves the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work if that performance is primarily intended for or directed toward commercial advantage or private monetary compensation.
35
+ Mechanical Rights and Statutory Royalties . Licensor reserves the exclusive right to collect, whether individually or via a music rights agency or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work ("cover version") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions), if Your distribution of such cover version is primarily intended for or directed toward commercial advantage or private monetary compensation.
36
+ Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor reserves the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions), if Your public digital performance is primarily intended for or directed toward commercial advantage or private monetary compensation.
37
+ 5. Representations, Warranties and Disclaimer
38
+
39
+ UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
40
+
41
+ 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
42
+
43
+ 7. Termination
44
+
45
+ This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.
46
+ Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.
47
+ 8. Miscellaneous
48
+
49
+ Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.
50
+ Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.
51
+ If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
52
+ No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
53
+ This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.
54
+ Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.
55
+
56
+ Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.
57
+
58
+ Creative Commons may be contacted at https://creativecommons.org/ .
UniCeption/.gitignore ADDED
@@ -0,0 +1,167 @@
+ # Local Folders
+ checkpoints/
+ local/
+ reference_data/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
UniCeption/.pre-commit-config.yaml ADDED
@@ -0,0 +1,18 @@
+ # See https://pre-commit.com for more information
+ # See https://pre-commit.com/hooks.html for more hooks
+ default_language_version:
+   python: python3
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v3.2.0
+     hooks:
+       - id: trailing-whitespace
+       - id: end-of-file-fixer
+   - repo: https://github.com/pre-commit/mirrors-isort
+     rev: 'v5.10.1'
+     hooks:
+       - id: isort
+   - repo: https://github.com/psf/black
+     rev: '23.3.0'
+     hooks:
+       - id: black
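The config above wires up whitespace fixers plus `isort` and `black`, matching the linting guidance in the README. As a short usage sketch using the standard pre-commit CLI (these are generic pre-commit commands, not scripts defined by this repository):

```bash
# Install the git hooks declared in .pre-commit-config.yaml
pre-commit install

# Run every hook once over the whole repository
pre-commit run --all-files
```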
UniCeption/.pylintrc ADDED
@@ -0,0 +1,399 @@
1
+ # This Pylint rcfile contains a best-effort configuration to uphold the
2
+ # best-practices and style described in the Google Python style guide:
3
+ # https://google.github.io/styleguide/pyguide.html
4
+ #
5
+ # Its canonical open-source location is:
6
+ # https://google.github.io/styleguide/pylintrc
7
+
8
+ [MAIN]
9
+
10
+ # Files or directories to be skipped. They should be base names, not paths.
11
+ ignore=third_party
12
+
13
+ # Files or directories matching the regex patterns are skipped. The regex
14
+ # matches against base names, not paths.
15
+ ignore-patterns=
16
+
17
+ # Pickle collected data for later comparisons.
18
+ persistent=no
19
+
20
+ # List of plugins (as comma separated values of python modules names) to load,
21
+ # usually to register additional checkers.
22
+ load-plugins=
23
+
24
+ # Use multiple processes to speed up Pylint.
25
+ jobs=4
26
+
27
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
28
+ # active Python interpreter and may run arbitrary code.
29
+ unsafe-load-any-extension=no
30
+
31
+
32
+ [MESSAGES CONTROL]
33
+
34
+ # Only show warnings with the listed confidence levels. Leave empty to show
35
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
36
+ confidence=
37
+
38
+ # Enable the message, report, category or checker with the given id(s). You can
39
+ # either give multiple identifier separated by comma (,) or put this option
40
+ # multiple time (only on the command line, not in the configuration file where
41
+ # it should appear only once). See also the "--disable" option for examples.
42
+ #enable=
43
+
44
+ # Disable the message, report, category or checker with the given id(s). You
45
+ # can either give multiple identifiers separated by comma (,) or put this
46
+ # option multiple times (only on the command line, not in the configuration
47
+ # file where it should appear only once).You can also use "--disable=all" to
48
+ # disable everything first and then reenable specific checks. For example, if
49
+ # you want to run only the similarities checker, you can use "--disable=all
50
+ # --enable=similarities". If you want to run only the classes checker, but have
51
+ # no Warning level messages displayed, use"--disable=all --enable=classes
52
+ # --disable=W"
53
+ disable=R,
54
+ abstract-method,
55
+ apply-builtin,
56
+ arguments-differ,
57
+ attribute-defined-outside-init,
58
+ backtick,
59
+ bad-option-value,
60
+ basestring-builtin,
61
+ buffer-builtin,
62
+ c-extension-no-member,
63
+ consider-using-enumerate,
64
+ cmp-builtin,
65
+ cmp-method,
66
+ coerce-builtin,
67
+ coerce-method,
68
+ delslice-method,
69
+ div-method,
70
+ eq-without-hash,
71
+ execfile-builtin,
72
+ file-builtin,
73
+ filter-builtin-not-iterating,
74
+ fixme,
75
+ getslice-method,
76
+ global-statement,
77
+ hex-method,
78
+ idiv-method,
79
+ implicit-str-concat,
80
+ import-error,
81
+ import-self,
82
+ import-star-module-level,
83
+ input-builtin,
84
+ intern-builtin,
85
+ invalid-str-codec,
86
+ locally-disabled,
87
+ long-builtin,
88
+ long-suffix,
89
+ map-builtin-not-iterating,
90
+ misplaced-comparison-constant,
91
+ missing-function-docstring,
92
+ metaclass-assignment,
93
+ next-method-called,
94
+ next-method-defined,
95
+ no-absolute-import,
96
+ no-init, # added
97
+ no-member,
98
+ no-name-in-module,
99
+ no-self-use,
100
+ nonzero-method,
101
+ oct-method,
102
+ old-division,
103
+ old-ne-operator,
104
+ old-octal-literal,
105
+ old-raise-syntax,
106
+ parameter-unpacking,
107
+ print-statement,
108
+ raising-string,
109
+ range-builtin-not-iterating,
110
+ raw_input-builtin,
111
+ rdiv-method,
112
+ reduce-builtin,
113
+ relative-import,
114
+ reload-builtin,
115
+ round-builtin,
116
+ setslice-method,
117
+ signature-differs,
118
+ standarderror-builtin,
119
+ suppressed-message,
120
+ sys-max-int,
121
+ trailing-newlines,
122
+ unichr-builtin,
123
+ unicode-builtin,
124
+ unnecessary-pass,
125
+ unpacking-in-except,
126
+ useless-else-on-loop,
127
+ useless-suppression,
128
+ using-cmp-argument,
129
+ wrong-import-order,
130
+ xrange-builtin,
131
+ zip-builtin-not-iterating,
132
+
133
+
134
+ [REPORTS]
135
+
136
+ # Set the output format. Available formats are text, parseable, colorized, msvs
137
+ # (visual studio) and html. You can also give a reporter class, eg
138
+ # mypackage.mymodule.MyReporterClass.
139
+ output-format=text
140
+
141
+ # Tells whether to display a full report or only the messages
142
+ reports=no
143
+
144
+ # Python expression which should return a note less than 10 (10 is the highest
145
+ # note). You have access to the variables errors warning, statement which
146
+ # respectively contain the number of errors / warnings messages and the total
147
+ # number of statements analyzed. This is used by the global evaluation report
148
+ # (RP0004).
149
+ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
150
+
151
+ # Template used to display messages. This is a python new-style format string
152
+ # used to format the message information. See doc for all details
153
+ #msg-template=
154
+
155
+
156
+ [BASIC]
157
+
158
+ # Good variable names which should always be accepted, separated by a comma
159
+ good-names=main,_
160
+
161
+ # Bad variable names which should always be refused, separated by a comma
162
+ bad-names=
163
+
164
+ # Colon-delimited sets of names that determine each other's naming style when
165
+ # the name regexes allow several styles.
166
+ name-group=
167
+
168
+ # Include a hint for the correct naming format with invalid-name
169
+ include-naming-hint=no
170
+
171
+ # List of decorators that produce properties, such as abc.abstractproperty. Add
172
+ # to this list to register other decorators that produce valid properties.
173
+ property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
174
+
175
+ # Regular expression matching correct function names
176
+ function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
177
+
178
+ # Regular expression matching correct variable names
179
+ variable-rgx=^[a-z][a-z0-9_]*$
180
+
181
+ # Regular expression matching correct constant names
182
+ const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
183
+
184
+ # Regular expression matching correct attribute names
185
+ attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
186
+
187
+ # Regular expression matching correct argument names
188
+ argument-rgx=^[a-z][a-z0-9_]*$
189
+
190
+ # Regular expression matching correct class attribute names
191
+ class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
192
+
193
+ # Regular expression matching correct inline iteration names
194
+ inlinevar-rgx=^[a-z][a-z0-9_]*$
195
+
196
+ # Regular expression matching correct class names
197
+ class-rgx=^_?[A-Z][a-zA-Z0-9]*$
198
+
199
+ # Regular expression matching correct module names
200
+ module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
201
+
202
+ # Regular expression matching correct method names
203
+ method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
204
+
205
+ # Regular expression which should only match function or class names that do
206
+ # not require a docstring.
207
+ no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
208
+
209
+ # Minimum line length for functions/classes that require docstrings, shorter
210
+ # ones are exempt.
211
+ docstring-min-length=12
212
+
213
+
214
+ [TYPECHECK]
215
+
216
+ # List of decorators that produce context managers, such as
217
+ # contextlib.contextmanager. Add to this list to register other decorators that
218
+ # produce valid context managers.
219
+ contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
220
+
221
+ # List of module names for which member attributes should not be checked
222
+ # (useful for modules/projects where namespaces are manipulated during runtime
223
+ # and thus existing member attributes cannot be deduced by static analysis. It
224
+ # supports qualified module names, as well as Unix pattern matching.
225
+ ignored-modules=
226
+
227
+ # List of class names for which member attributes should not be checked (useful
228
+ # for classes with dynamically set attributes). This supports the use of
229
+ # qualified names.
230
+ ignored-classes=optparse.Values,thread._local,_thread._local
231
+
232
+ # List of members which are set dynamically and missed by pylint inference
233
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
234
+ # expressions are accepted.
235
+ generated-members=
236
+
237
+
238
+ [FORMAT]
239
+
240
+ # Maximum number of characters on a single line.
241
+ max-line-length=80
242
+
243
+ # TODO(https://github.com/pylint-dev/pylint/issues/3352): Direct pylint to exempt
244
+ # lines made too long by directives to pytype.
245
+
246
+ # Regexp for a line that is allowed to be longer than the limit.
247
+ ignore-long-lines=(?x)(
248
+ ^\s*(\#\ )?<?https?://\S+>?$|
249
+ ^\s*(from\s+\S+\s+)?import\s+.+$)
250
+
251
+ # Allow the body of an if to be on the same line as the test if there is no
252
+ # else.
253
+ single-line-if-stmt=yes
254
+
255
+ # Maximum number of lines in a module
256
+ max-module-lines=99999
257
+
258
+ # String used as indentation unit. The internal Google style guide mandates 2
259
+ # spaces. Google's externaly-published style guide says 4, consistent with
260
+ # PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
261
+ # projects (like TensorFlow).
262
+ indent-string=' '
263
+
264
+ # Number of spaces of indent required inside a hanging or continued line.
265
+ indent-after-paren=4
266
+
267
+ # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
268
+ expected-line-ending-format=
269
+
270
+
271
+ [MISCELLANEOUS]
272
+
273
+ # List of note tags to take in consideration, separated by a comma.
274
+ notes=TODO
275
+
276
+
277
+ [STRING]
278
+
279
+ # This flag controls whether inconsistent-quotes generates a warning when the
280
+ # character used as a quote delimiter is used inconsistently within a module.
281
+ check-quote-consistency=yes
282
+
283
+
284
+ [VARIABLES]
285
+
286
+ # Tells whether we should check for unused import in __init__ files.
287
+ init-import=no
288
+
289
+ # A regular expression matching the name of dummy variables (i.e. expectedly
290
+ # not used).
291
+ dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
292
+
293
+ # List of additional names supposed to be defined in builtins. Remember that
294
+ # you should avoid to define new builtins when possible.
295
+ additional-builtins=
296
+
297
+ # List of strings which can identify a callback function by name. A callback
298
+ # name must start or end with one of those strings.
299
+ callbacks=cb_,_cb
300
+
301
+ # List of qualified module names which can have objects that can redefine
302
+ # builtins.
303
+ redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
304
+
305
+
306
+ [LOGGING]
307
+
308
+ # Logging modules to check that the string format arguments are in logging
309
+ # function parameter format
310
+ logging-modules=logging,absl.logging,tensorflow.io.logging
311
+
312
+
313
+ [SIMILARITIES]
314
+
315
+ # Minimum lines number of a similarity.
316
+ min-similarity-lines=4
317
+
318
+ # Ignore comments when computing similarities.
319
+ ignore-comments=yes
320
+
321
+ # Ignore docstrings when computing similarities.
322
+ ignore-docstrings=yes
323
+
324
+ # Ignore imports when computing similarities.
325
+ ignore-imports=no
326
+
327
+
328
+ [SPELLING]
329
+
330
+ # Spelling dictionary name. Available dictionaries: none. To make it working
331
+ # install python-enchant package.
332
+ spelling-dict=
333
+
334
+ # List of comma separated words that should not be checked.
335
+ spelling-ignore-words=
336
+
337
+ # A path to a file that contains private dictionary; one word per line.
338
+ spelling-private-dict-file=
339
+
340
+ # Tells whether to store unknown words to indicated private dictionary in
341
+ # --spelling-private-dict-file option instead of raising a message.
342
+ spelling-store-unknown-words=no
343
+
344
+
345
+ [IMPORTS]
346
+
347
+ # Deprecated modules which should not be used, separated by a comma
348
+ deprecated-modules=regsub,
349
+ TERMIOS,
350
+ Bastion,
351
+ rexec,
352
+ sets
353
+
354
+ # Create a graph of every (i.e. internal and external) dependencies in the
355
+ # given file (report RP0402 must not be disabled)
356
+ import-graph=
357
+
358
+ # Create a graph of external dependencies in the given file (report RP0402 must
359
+ # not be disabled)
360
+ ext-import-graph=
361
+
362
+ # Create a graph of internal dependencies in the given file (report RP0402 must
363
+ # not be disabled)
364
+ int-import-graph=
365
+
366
+ # Force import order to recognize a module as part of the standard
367
+ # compatibility libraries.
368
+ known-standard-library=
369
+
370
+ # Force import order to recognize a module as part of a third party library.
371
+ known-third-party=enchant, absl
372
+
373
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
374
+ # 3 compatible code, which means that the block might have code that exists
375
+ # only in one or another interpreter, leading to false positives when analysed.
376
+ analyse-fallback-blocks=no
377
+
378
+
379
+ [CLASSES]
380
+
381
+ # List of method names used to declare (i.e. assign) instance attributes.
382
+ defining-attr-methods=__init__,
383
+ __new__,
384
+ setUp
385
+
386
+ # List of member names, which should be excluded from the protected access
387
+ # warning.
388
+ exclude-protected=_asdict,
389
+ _fields,
390
+ _replace,
391
+ _source,
392
+ _make
393
+
394
+ # List of valid names for the first argument in a class method.
395
+ valid-classmethod-first-arg=cls,
396
+ class_
397
+
398
+ # List of valid names for the first argument in a metaclass class method.
399
+ valid-metaclass-classmethod-first-arg=mcs
UniCeption/LICENSE ADDED
@@ -0,0 +1,28 @@
+ BSD 3-Clause License
+
+ Copyright (c) 2024, AirLab Stacks
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
UniCeption/README.md ADDED
@@ -0,0 +1,155 @@
+ # UniCeption
+
+ UniCeption houses modular building blocks for developing and training generalizable perception models for all things related to 3D, 4D, spatial AI and scene understanding.
+ It is designed to be flexible and extensible, allowing researchers to easily experiment with different architectures and configurations.
+
+ Please refer to the [Developer Guidelines](#developer-guidelines) for contributing to the project.
+
+ ## Installation
+
+ Clone the repository to your local machine by running the following command:
+
+ ```bash
+ git clone [email protected]:castacks/UniCeption.git
+ cd UniCeption
+ ```
+
+ ### Standard Installation
+
+ Install the `uniception` package in development mode by running the following commands:
+
+ ```bash
+ # Please use Conda or Python Virtual Environment based on your preference
+ # For Conda Environment
+ conda create --name uniception python=3.12
+ conda activate uniception
+ # For Python Virtual Environment
+ virtualenv uniception
+ source uniception/bin/activate
+
+ # Install UniCeption with base dependencies (includes PyTorch)
+ pip install -e .
+
+ # Optional: Install with XFormers support
+ pip install -e ".[xformers]"
+
+ # Optional: Install with development tools
+ pip install -e ".[dev]"
+
+ # Optional: Install all optional dependencies
+ pip install -e ".[all]"
+
+ # Setup pre-commit hooks for development
+ pre-commit install
+ ```
+
+ ### Optional: CroCo RoPE Extension Installation
+
+ To use CroCo models with the custom RoPE kernel:
+
+ ```bash
+ # Recommended: Use the console script
+ uniception-install-croco
+
+ # Alternative: Set environment variable during installation
+ INSTALL_CROCO_ROPE=true pip install -e .
+
+ # Manual compilation (if needed)
+ cd uniception/models/libs/croco/curope
+ python setup.py build_ext --inplace
+ cd ../../../../../
+ ```
+
+ ### Installation Validation and Dependency Checking
+
+ After installation, use these console scripts to validate your setup:
+
+ ```bash
+ # Validate installation and check dependencies
+ uniception-validate
+
+ # Check which optional dependencies are available
+ uniception-check-deps
+ ```
+
+ ### Advanced Installation Options
+
+ #### Docker Installation (No Internet Access)
+
+ If you're working in a Docker container that already has Python dependencies installed but no internet access, you can install UniCeption in development mode without triggering network requests:
+
+ ```bash
+ # Install only the package structure without dependencies
+ pip install -e . --no-deps
+ ```
+
+ **Note:** This command assumes your Docker image already contains all required dependencies (PyTorch, etc.). Use `uniception-validate` after installation to verify all dependencies are available.
+
+ #### Offline Installation
+
+ For environments without internet access:
+
+ ```bash
+ # 1. On a machine with internet access, prepare offline wheels
+ uniception-prepare-offline --output-dir offline_wheels --extras all
+
+ # 2. Copy the offline_wheels directory to your offline environment
+ # 3. Run the offline installation
+ cd offline_wheels
+ INSTALL_CROCO_ROPE=true INSTALL_XFORMERS=true ./install_offline.sh
+ ```
+
+ #### Downloading Checkpoints
+
+ Download UniCeption format custom checkpoints:
+
+ ```bash
+ # Download all available checkpoints
+ uniception-download-checkpoints
+
+ # Download specific folders only (e.g., encoders and prediction heads)
+ uniception-download-checkpoints --folders encoders prediction_heads
+
+ # Specify custom destination
+ uniception-download-checkpoints --destination /path/to/checkpoints
+ ```
+
+ **Available options:**
+ - `--folders`: Specify which folders to download. Choices: `encoders`, `info_sharing`, `prediction_heads`, `examples` (default: all folders)
+ - `--destination`: Custom destination folder for downloaded checkpoints (default: current directory)
+
+ ---
+
+ ## Currently Supported Components
+
+ ### Encoders
+
+ Please refer to the `uniception/models/encoders` directory for the supported encoders and documentation for adding new encoders. The supported encoders can be listed by running:
+
+ ```bash
+ python3 -m uniception.models.encoders.list
+ ```
+
+ ---
+
+ ## Information Sharing Blocks
+
+ Please refer to the `uniception/models/info_sharing` directory for the supported information sharing blocks.
+
+ ---
+
+ ## Prediction Heads
+
+ Please refer to the `uniception/models/prediction_heads` directory for the supported prediction heads.
+
+ ---
+
+ ## Developer Guidelines
+
+ Please follow these guidelines when contributing to UniCeption:
+ - **Code Style**: Follow the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) for code style.
+ - **Documentation**: Add docstrings to all classes and methods.
+ - **Unit Tests**: Add necessary unit tests to the `tests` folder.
+ - **Linting**: Run `black` & `isort` on your code before committing. For example, you can run `black . && isort .`.
+
+ Please create a pull request for any changes you make, and ensure that all tests pass before merging.
UniCeption/examples/models/cosmos/autoencoding.py ADDED
@@ -0,0 +1,48 @@
+ import os
+
+ import cv2
+ import torch
+ from matplotlib import pyplot as plt
+
+ from uniception.models.encoders.base import ViTEncoderInput
+ from uniception.models.encoders.cosmos import CosmosEncoder
+ from uniception.models.prediction_heads.cosmos import CosmosSingleChannel
+
+ base_path = os.path.dirname(os.path.abspath(__file__))
+
+ encoder = CosmosEncoder(
+     name="cosmos",
+     patch_size=8,
+     pretrained_checkpoint_path=os.path.join(
+         base_path, "../../../checkpoints/encoders/cosmos/Cosmos-Tokenizer-CI8x8/encoder.pth"
+     ),
+ )
+
+ decoder = CosmosSingleChannel(
+     patch_size=8,
+     pretrained_checkpoint_path=os.path.join(base_path, "../../../checkpoints/prediction_heads/cosmos/decoder_8.pth"),
+ )
+
+ example_image = cv2.imread(os.path.join(base_path, "./example.png"))
+ example_image = cv2.cvtColor(example_image, cv2.COLOR_BGR2RGB)
+ example_tensor = torch.tensor(example_image).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+ example_tensor = example_tensor * 2.0 - 1.0  # Normalize to [-1, 1] according to the COSMOS Encoder
+
+ encoded_latent = encoder(ViTEncoderInput("cosmos", example_tensor)).features
+
+ decoded_image = decoder(encoded_latent)
+ decoded_image = (decoded_image + 1.0) / 2.0  # Denormalize to [0, 1] for visualization
+
+ # plot the original and decoded images
+ plt.figure(figsize=(10, 5))
+ plt.subplot(1, 2, 1)
+ plt.imshow(example_image)
+ plt.title("Original Image")
+ plt.axis("off")
+
+ plt.subplot(1, 2, 2)
+ plt.imshow(decoded_image.squeeze().detach().permute(1, 2, 0).cpu().numpy())
+ plt.title("Decoded Image")
+ plt.axis("off")
+
+ plt.savefig(os.path.join(base_path, "example_decoded.png"))
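The script above resolves its checkpoint paths relative to its own location (`../../../checkpoints/...`, i.e. a `checkpoints/` folder at the UniCeption root). A plausible way to run it, assuming the README's checkpoint download script provides the COSMOS encoder and decoder weights under that folder (this invocation is a sketch, not documented in the commit):

```bash
# Assumption: run from the UniCeption root so checkpoints/ lands in the expected place
uniception-download-checkpoints --folders encoders prediction_heads
python examples/models/cosmos/autoencoding.py
```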
UniCeption/examples/models/cosmos/example.png ADDED

Git LFS Details

  • SHA256: 5e6ee5528f76e5c0794e2708d688877b0f06f2139a11e883a3832ad57f19f89c
  • Pointer size: 131 Bytes
  • Size of remote file: 711 kB
UniCeption/examples/models/cosmos/example_decoded.png ADDED

Git LFS Details

  • SHA256: f948b50b602260352e14fca5f51999f01bd98b8e167dd1595451418380eaed21
  • Pointer size: 131 Bytes
  • Size of remote file: 348 kB
UniCeption/examples/models/dust3r/convert_dust3r_weights_to_uniception.py ADDED
@@ -0,0 +1,331 @@
+ """
+ This file extracts the cross-attention transformer & prediction head weights from DUSt3R checkpoints into UniCeption format.
+
+ Special Notice: the DUSt3R authors changed their released weights before/after CVPR, and
+ UniCeption uses the checkpoints from BEFORE CVPR (they perform better). So please make sure you are not converting
+ the newly downloaded weights. Consult Yuchen and Nikhil on where to find the old weights.
+ """
+
+ import argparse
+ import os
+
+ import torch
+ from torch import nn
+
+ from uniception.models.info_sharing.cross_attention_transformer import MultiViewCrossAttentionTransformerIFR
+ from uniception.models.prediction_heads.dpt import DPTFeature, DPTRegressionProcessor
+ from uniception.models.prediction_heads.linear import LinearFeature
+
+
+ def extract_cross_attention_weights(checkpoint_path, output_folder, output_filename):
+     "Extract the UniCeption format cross attention weights from the original CroCoV2/DUSt3R/MASt3R checkpoints."
+     # Load checkpoint
+     checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
+
+     # Filter the relevant keys for the cross attention model and duplicate if necessary
+     filtered_checkpoint = checkpoint["model"]
+     filtered_checkpoint = {k: v for k, v in filtered_checkpoint.items() if "dec" in k}
+     duplicate_checkpoint = {}
+     if not any(k.startswith("dec_blocks2") for k in filtered_checkpoint):
+         print("Duplicating dec_blocks to dec_blocks2")
+         for key, value in filtered_checkpoint.items():
+             if key.startswith("dec_blocks"):
+                 duplicate_checkpoint[key.replace("dec_blocks", "dec_blocks2")] = value
+     filtered_checkpoint = {**filtered_checkpoint, **duplicate_checkpoint}
+     new_checkpoint = {}
+     for k, v in filtered_checkpoint.items():
+         if "decoder_embed" in k:
+             new_key = k.replace("decoder_embed", "proj_embed")
+             new_checkpoint[new_key] = v
+         elif "dec_blocks." in k:
+             new_key = k.replace("dec_blocks.", "multi_view_branches.0.")
+             new_checkpoint[new_key] = v
+         elif "dec_blocks2." in k:
+             new_key = k.replace("dec_blocks2.", "multi_view_branches.1.")
+             new_checkpoint[new_key] = v
+         elif "dec_norm" in k:
+             new_key = k.replace("dec_norm", "norm")
+             new_checkpoint[new_key] = v
+
+     # Init model
+     model = MultiViewCrossAttentionTransformerIFR(
+         name="MV-CAT-IFR",
+         input_embed_dim=1024,
+         num_views=2,
+         indices=[5, 8],
+         norm_intermediate=False,
+     )
+
+     # Load new checkpoint
+     print(model.load_state_dict(new_checkpoint))
+
+     # Save the checkpoint
+     save_checkpoint = {}
+     save_checkpoint["model"] = model.state_dict()
+     os.makedirs(os.path.join(output_folder, "cross_attn_transformer"), exist_ok=True)
+     save_path = os.path.join(output_folder, "cross_attn_transformer", output_filename)
+     torch.save(save_checkpoint, save_path)
+
+
+ def extract_dust3r_dpt_checkpoints(checkpoint_path, output_folder, output_filename):
+     "Extract the UniCeption format DPT head weights from the original DUSt3R checkpoint."
+     source_ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
+
+     for head in ["head1", "head2"]:
+         # Extract head weights from the checkpoint
+         dpt_head_weights = {k: v for k, v in source_ckpt["model"].items() if k.startswith(f"downstream_{head}")}
+         dpt_head_weights = {k.replace(f"downstream_{head}.dpt.", ""): v for k, v in dpt_head_weights.items()}
+         dpt_feature_weights = {k: v for k, v in dpt_head_weights.items() if not (k.startswith("head"))}
+
+         # Construct the DPTFeature module and load the weights
+         dpt = DPTFeature(
+             patch_size=16,
+             hooks=[0, 1, 2, 3],
+             input_feature_dims=[1024, 768, 768, 768],
+             layer_dims=[96, 192, 384, 768],
+             feature_dim=256,
+             use_bn=False,
+             output_width_ratio=1,
+         )
+
+         dpt.load_state_dict(dpt_feature_weights, strict=True)
+
+         # Construct the dpt processor module and load the weights
+         dpt_processor_weights = {k.replace("head.", ""): v for k, v in dpt_head_weights.items() if k.startswith("head")}
+
+         # Replace the keys according to:
+         key_replace_dict = {
+             "0.weight": "conv1.weight",
+             "0.bias": "conv1.bias",
+             "2.weight": "conv2.0.weight",
+             "2.bias": "conv2.0.bias",
+             "4.weight": "conv2.2.weight",
+             "4.bias": "conv2.2.bias",
+         }
+
+         dpt_processor_weights = {key_replace_dict.get(k, k): v for k, v in dpt_processor_weights.items()}
+
+         dpt_reg_processor = DPTRegressionProcessor(input_feature_dim=256, output_dim=4, hidden_dims=[128, 128])
+
+         dpt_reg_processor.load_state_dict(dpt_processor_weights, strict=True)
+
+         # Save the state_dicts of the DPTFeature and DPTRegressionProcessor
+         dpt_feature_path = os.path.join(output_folder, "dpt_feature_head", output_filename + f"_feature_{head}.pth")
+         dpt_reg_processor_path = os.path.join(
+             output_folder, "dpt_reg_processor", output_filename + f"_reg_processor{head[-1]}.pth"
+         )
+
+         os.makedirs(os.path.dirname(dpt_feature_path), exist_ok=True)
+         os.makedirs(os.path.dirname(dpt_reg_processor_path), exist_ok=True)
+
+         torch.save({"model": dpt.state_dict()}, dpt_feature_path)
+         torch.save({"model": dpt_reg_processor.state_dict()}, dpt_reg_processor_path)
+
+
+ def extract_dust3r_linear_checkpoints(checkpoint_path, output_folder, output_filename):
+     "Extract the UniCeption format linear head weights from the original DUSt3R checkpoint."
+     test_linear_to_conv()
+
+     source_ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
+
+     for head in ["head1", "head2"]:
+         linear_head_params = {k: v for k, v in source_ckpt["model"].items() if k.startswith(f"downstream_{head}")}
+         linear_head_params = {k.replace(f"downstream_{head}.proj.", ""): v for k, v in linear_head_params.items()}
+
+         assert set(linear_head_params.keys()) == {"weight", "bias"}
+
+         input_feature_dim = 768
+         output_dim = 4
+         patch_size = 16
+
+         linear = nn.Linear(input_feature_dim, output_dim * patch_size * patch_size, bias=True)
+         linear.load_state_dict(linear_head_params, strict=True)
+
+         conv_layer = linear_to_conv2d(linear)
+
+         linear_feature = LinearFeature(input_feature_dim, 4, patch_size)
+         linear_feature.linear.load_state_dict(conv_layer.state_dict(), strict=True)
+
+         linear_feature_path = os.path.join(
+             output_folder, "linear_feature_head", output_filename + f"_feature_{head}.pth"
+         )
+         os.makedirs(os.path.dirname(linear_feature_path), exist_ok=True)
+         torch.save({"model": linear_feature.state_dict()}, linear_feature_path)
+
+
+ def extract_mast3r_dpt_checkpoints(checkpoint_path, output_folder, output_filename):
+     "Extract the UniCeption format DPT head weights from the original MASt3R checkpoint."
+     source_ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
+
+     for head in ["head1", "head2"]:
+         dpt_head = {k: v for k, v in source_ckpt["model"].items() if k.startswith(f"downstream_{head}")}
+         dpt_head = {k.replace(f"downstream_{head}.", ""): v for k, v in dpt_head.items()}
+         dpt_head = {k.replace("dpt.", ""): v for k, v in dpt_head.items()}
+
+         dpt_feature_weights = {
+             k: v for k, v in dpt_head.items() if not (k.startswith("head") or k.startswith("head_local_features"))
+         }
+
+         dpt = DPTFeature(
+             patch_size=16,
+             hooks=[0, 1, 2, 3],
+             input_feature_dims=[1024, 768, 768, 768],
+             layer_dims=[96, 192, 384, 768],
+             feature_dim=256,
+             use_bn=False,
+             output_width_ratio=1,
+         )
+
+         dpt.load_state_dict(dpt_feature_weights, strict=True)
+
+         dpt_processor_weights = {
+             k.replace("head.", ""): v
+             for k, v in dpt_head.items()
+             if (k.startswith("head") and not k.startswith("head_local_features"))
+         }
+
+         # Replace the keys according to:
+         key_replace_dict = {
+             "0.weight": "conv1.weight",
+             "0.bias": "conv1.bias",
+             "2.weight": "conv2.0.weight",
+             "2.bias": "conv2.0.bias",
+             "4.weight": "conv2.2.weight",
+             "4.bias": "conv2.2.bias",
+         }
+
+         dpt_processor_weights = {key_replace_dict.get(k, k): v for k, v in dpt_processor_weights.items()}
+
+         dpt_reg_processor = DPTRegressionProcessor(input_feature_dim=256, output_dim=4, hidden_dims=[128, 128])
+
+         dpt_reg_processor.load_state_dict(dpt_processor_weights, strict=True)
+
+         # Save the state_dicts of the DPTFeature and DPTRegressionProcessor
204
+ dpt_feature_path = os.path.join(output_folder, "dpt_feature_head", output_filename + f"_feature_{head}.pth")
205
+ dpt_reg_processor_path = os.path.join(
206
+ output_folder, "dpt_reg_processor", output_filename + f"_reg_processor{head[-1]}.pth"
207
+ )
208
+
209
+ os.makedirs(os.path.dirname(dpt_feature_path), exist_ok=True)
210
+ os.makedirs(os.path.dirname(dpt_reg_processor_path), exist_ok=True)
211
+
212
+ torch.save({"model": dpt.state_dict()}, dpt_feature_path)
213
+ torch.save({"model": dpt_reg_processor.state_dict()}, dpt_reg_processor_path)
214
+
215
+
216
+ def linear_to_conv2d(linear_layer):
217
+ """
218
+ Converts a nn.Linear layer to an equivalent nn.Conv2d layer with a 1x1 kernel.
219
+
220
+ Parameters:
221
+ - linear_layer (nn.Linear): The Linear layer to convert.
222
+
223
+ Returns:
224
+ - conv_layer (nn.Conv2d): The equivalent Conv2d layer.
225
+ """
226
+ # Extract in_features and out_features from the Linear layer
227
+ in_features = linear_layer.in_features
228
+ out_features = linear_layer.out_features
229
+ bias = linear_layer.bias is not None
230
+
231
+ # Create a Conv2d layer with a 1x1 kernel
232
+ conv_layer = nn.Conv2d(
233
+ in_channels=in_features, out_channels=out_features, kernel_size=1, stride=1, padding=0, bias=bias
234
+ )
235
+
236
+ # Reshape Linear weights to match Conv2d weights
237
+ conv_weight = linear_layer.weight.data.view(out_features, in_features, 1, 1).clone()
238
+ conv_layer.weight.data = conv_weight
239
+
240
+ # Copy bias if it exists
241
+ if bias:
242
+ conv_layer.bias.data = linear_layer.bias.data.clone()
243
+
244
+ return conv_layer
245
+
246
+
247
+ def test_linear_to_conv():
248
+ "Test the linear_to_conv2d function."
249
+ batch_size = 4
250
+ height = 16
251
+ width = 24
252
+ in_channels = 3
253
+ out_channels = 5
254
+
255
+ # Sample input tensor in BHWC format
256
+ x_linear = torch.randn(batch_size, height, width, in_channels)
257
+
258
+ # Define Linear layer
259
+ linear_layer = nn.Linear(in_channels, out_channels)
260
+ output_linear = linear_layer(x_linear)
261
+
262
+ # Transpose input tensor to BCHW format for Conv2d
263
+ x_conv = x_linear.permute(0, 3, 1, 2)
264
+
265
+ # Define Conv2d layer
266
+ conv_layer = linear_to_conv2d(linear_layer)
267
+
268
+ # Get Conv2d output and transpose back to BHWC format
269
+ output_conv = conv_layer(x_conv).permute(0, 2, 3, 1)
270
+
271
+ # Verify that outputs are the same
272
+ assert torch.allclose(output_linear, output_conv, atol=1e-6)
273
+
274
+
275
+ if __name__ == "__main__":
276
+ parser = argparse.ArgumentParser(description="Extract dust3r checkpoints to uniception format")
277
+
278
+ parser.add_argument(
279
+ "-dcf", "--dust3r_checkpoints_folder", type=str, required=True, help="Path to the dust3r checkpoints folder"
280
+ )
281
+ parser.add_argument("-of", "--output_folder", type=str, required=True, help="Path to the output folder")
282
+
283
+ args = parser.parse_args()
284
+
285
+ output_folder = args.output_folder
286
+ info_sharing_output_folder = os.path.join(output_folder, "info_sharing")
287
+ pred_head_output_folder = os.path.join(output_folder, "prediction_heads")
288
+ os.makedirs(output_folder, exist_ok=True)
289
+ os.makedirs(info_sharing_output_folder, exist_ok=True)
290
+ os.makedirs(pred_head_output_folder, exist_ok=True)
291
+
292
+ # Extract croco checkpoint
293
+ print("Extracting CroCo checkpoint...")
294
+ croco_ckpt_filepath = os.path.join(args.dust3r_checkpoints_folder, "CroCo_V2_ViTLarge_BaseDecoder.pth")
295
+ extract_cross_attention_weights(
296
+ croco_ckpt_filepath, info_sharing_output_folder, "Two_View_Cross_Attention_Transformer_CroCo.pth"
297
+ )
298
+
299
+ # Extract dust3r 224 linear checkpoint
300
+ print("Extracting DUSt3R 224 linear checkpoint...")
301
+ dust3r_ckpt_filepath = os.path.join(args.dust3r_checkpoints_folder, "DUSt3R_ViTLarge_BaseDecoder_224_linear.pth")
302
+ extract_cross_attention_weights(
303
+ dust3r_ckpt_filepath, info_sharing_output_folder, "Two_View_Cross_Attention_Transformer_DUSt3R_224_linear.pth"
304
+ )
305
+ extract_dust3r_linear_checkpoints(dust3r_ckpt_filepath, pred_head_output_folder, "DUSt3R_224_linear")
306
+
307
+ # Extract dust3r 512 linear checkpoint
308
+ print("Extracting DUSt3R 512 linear checkpoint...")
309
+ dust3r_ckpt_filepath = os.path.join(args.dust3r_checkpoints_folder, "DUSt3R_ViTLarge_BaseDecoder_512_linear.pth")
310
+ extract_cross_attention_weights(
311
+ dust3r_ckpt_filepath, info_sharing_output_folder, "Two_View_Cross_Attention_Transformer_DUSt3R_512_linear.pth"
312
+ )
313
+ extract_dust3r_linear_checkpoints(dust3r_ckpt_filepath, pred_head_output_folder, "DUSt3R_512_linear")
314
+
315
+ # Extract dust3r 512 dpt checkpoint
316
+ print("Extracting DUSt3R 512 dpt checkpoint...")
317
+ dust3r_ckpt_filepath = os.path.join(args.dust3r_checkpoints_folder, "DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth")
318
+ extract_cross_attention_weights(
319
+ dust3r_ckpt_filepath, info_sharing_output_folder, "Two_View_Cross_Attention_Transformer_DUSt3R_512_dpt.pth"
320
+ )
321
+ extract_dust3r_dpt_checkpoints(dust3r_ckpt_filepath, pred_head_output_folder, "DUSt3R_512_dpt")
322
+
323
+ # Extract mast3r 512 dpt checkpoint
324
+ print("Extracting MASt3R 512 dpt checkpoint...")
325
+ mast3r_ckpt_path = os.path.join(
326
+ args.dust3r_checkpoints_folder, "MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"
327
+ )
328
+ extract_cross_attention_weights(
329
+ mast3r_ckpt_path, info_sharing_output_folder, "Two_View_Cross_Attention_Transformer_MASt3R_512_dpt.pth"
330
+ )
331
+ extract_mast3r_dpt_checkpoints(mast3r_ckpt_path, pred_head_output_folder, "MASt3R_512_dpt")
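For reference, the remapping in `extract_cross_attention_weights` above reduces to four string substitutions: `decoder_embed` → `proj_embed`, `dec_blocks.` → `multi_view_branches.0.`, the duplicated `dec_blocks2.` → `multi_view_branches.1.`, and `dec_norm` → `norm`. A minimal sketch of that rule on placeholder keys (the key names below are illustrative, not taken from a real checkpoint):

```python
# Minimal sketch of the DUSt3R -> UniCeption key remapping; keys are placeholders.
def remap_key(k: str) -> str:
    if "decoder_embed" in k:
        return k.replace("decoder_embed", "proj_embed")
    if "dec_blocks." in k:  # matches "dec_blocks.N...", not "dec_blocks2.N..."
        return k.replace("dec_blocks.", "multi_view_branches.0.")
    if "dec_blocks2." in k:
        return k.replace("dec_blocks2.", "multi_view_branches.1.")
    if "dec_norm" in k:
        return k.replace("dec_norm", "norm")
    return k

toy_keys = [
    "decoder_embed.weight",
    "dec_blocks.0.attn.qkv.weight",
    "dec_blocks2.0.attn.qkv.weight",
    "dec_norm.weight",
]
for k in toy_keys:
    print(f"{k} -> {remap_key(k)}")
# decoder_embed.weight -> proj_embed.weight
# dec_blocks.0.attn.qkv.weight -> multi_view_branches.0.0.attn.qkv.weight
# dec_blocks2.0.attn.qkv.weight -> multi_view_branches.1.0.attn.qkv.weight
# dec_norm.weight -> norm.weight
```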
UniCeption/examples/models/dust3r/dust3r.py ADDED
@@ -0,0 +1,261 @@
1
+ """
2
+ Initializing Pre-trained DUSt3R using UniCeption
3
+ """
4
+
5
+ import argparse
6
+ import os
7
+ from io import BytesIO
8
+
9
+ import numpy as np
10
+ import requests
11
+ import rerun as rr
12
+ import torch
13
+ from PIL import Image
14
+
15
+ from uniception.models.factory import DUSt3R
16
+ from uniception.utils.viz import script_add_rerun_args
17
+
18
+
19
+ def get_model_configurations_and_checkpoints():
20
+ """
21
+ Get different DUSt3R model configurations and paths to refactored checkpoints.
22
+
23
+ Returns:
24
+ Tuple[List[str], dict]: A tuple containing the model configurations and paths to refactored checkpoints.
25
+ """
26
+ # Initialize model configurations
27
+ model_configurations = ["dust3r_224_linear", "dust3r_512_linear", "dust3r_512_dpt", "dust3r_512_dpt_mast3r"]
28
+
29
+ # Get paths to pretrained checkpoints
30
+ current_file_path = os.path.abspath(__file__)
31
+ relative_checkpoint_path = os.path.join(os.path.dirname(current_file_path), "../../../checkpoints")
32
+
33
+ # Initialize model configurations
34
+ model_to_checkpoint_path = {
35
+ "dust3r_512_dpt": {
36
+ "encoder": f"{relative_checkpoint_path}/encoders/CroCo_Encoder_512_DUSt3R_dpt.pth",
37
+ "info_sharing": f"{relative_checkpoint_path}/info_sharing/cross_attn_transformer/Two_View_Cross_Attention_Transformer_DUSt3R_512_dpt.pth",
38
+ "feature_head": [
39
+ f"{relative_checkpoint_path}/prediction_heads/dpt_feature_head/DUSt3R_512_dpt_feature_head1.pth",
40
+ f"{relative_checkpoint_path}/prediction_heads/dpt_feature_head/DUSt3R_512_dpt_feature_head2.pth",
41
+ ],
42
+ "regressor": [
43
+ f"{relative_checkpoint_path}/prediction_heads/dpt_reg_processor/DUSt3R_512_dpt_reg_processor1.pth",
44
+ f"{relative_checkpoint_path}/prediction_heads/dpt_reg_processor/DUSt3R_512_dpt_reg_processor2.pth",
45
+ ],
46
+ "ckpt_path": f"{relative_checkpoint_path}/examples/original_dust3r/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth",
47
+ },
48
+ "dust3r_512_dpt_mast3r": {
49
+ "encoder": f"{relative_checkpoint_path}/encoders/CroCo_Encoder_512_MASt3R.pth",
50
+ "info_sharing": f"{relative_checkpoint_path}/info_sharing/cross_attn_transformer/Two_View_Cross_Attention_Transformer_MASt3R_512_dpt.pth",
51
+ "feature_head": [
52
+ f"{relative_checkpoint_path}/prediction_heads/dpt_feature_head/MASt3R_512_dpt_feature_head1.pth",
53
+ f"{relative_checkpoint_path}/prediction_heads/dpt_feature_head/MASt3R_512_dpt_feature_head2.pth",
54
+ ],
55
+ "regressor": [
56
+ f"{relative_checkpoint_path}/prediction_heads/dpt_reg_processor/MASt3R_512_dpt_reg_processor1.pth",
57
+ f"{relative_checkpoint_path}/prediction_heads/dpt_reg_processor/MASt3R_512_dpt_reg_processor2.pth",
58
+ ],
59
+ "ckpt_path": f"{relative_checkpoint_path}/examples/original_dust3r/DUSt3R_ViTLarge_BaseDecoder_512_dpt_mast3r.pth",
60
+ },
61
+ "dust3r_512_linear": {
62
+ "encoder": f"{relative_checkpoint_path}/encoders/CroCo_Encoder_512_DUSt3R_linear.pth",
63
+ "info_sharing": f"{relative_checkpoint_path}/info_sharing/cross_attn_transformer/Two_View_Cross_Attention_Transformer_DUSt3R_512_linear.pth",
64
+ "feature_head": [
65
+ f"{relative_checkpoint_path}/prediction_heads/linear_feature_head/DUSt3R_512_linear_feature_head1.pth",
66
+ f"{relative_checkpoint_path}/prediction_heads/linear_feature_head/DUSt3R_512_linear_feature_head2.pth",
67
+ ],
68
+ "regressor": None,
69
+ "ckpt_path": f"{relative_checkpoint_path}/examples/original_dust3r/DUSt3R_ViTLarge_BaseDecoder_512_linear.pth",
70
+ },
71
+ "dust3r_224_linear": {
72
+ "encoder": f"{relative_checkpoint_path}/encoders/CroCo_Encoder_224_DUSt3R_linear.pth",
73
+ "info_sharing": f"{relative_checkpoint_path}/info_sharing/cross_attn_transformer/Two_View_Cross_Attention_Transformer_DUSt3R_224_linear.pth",
74
+ "feature_head": [
75
+ f"{relative_checkpoint_path}/prediction_heads/linear_feature_head/DUSt3R_224_linear_feature_head1.pth",
76
+ f"{relative_checkpoint_path}/prediction_heads/linear_feature_head/DUSt3R_224_linear_feature_head2.pth",
77
+ ],
78
+ "regressor": None,
79
+ "ckpt_path": f"{relative_checkpoint_path}/examples/original_dust3r/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth",
80
+ },
81
+ }
82
+ return model_configurations, model_to_checkpoint_path
83
+
84
+
85
+ def get_parser():
86
+ "Argument parser for the script."
87
+ parser = argparse.ArgumentParser()
88
+ parser.add_argument("--viz", action="store_true")
89
+
90
+ return parser
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Parse arguments
95
+ parser = get_parser()
96
+ script_add_rerun_args(parser) # Options: --addr
97
+ args = parser.parse_args()
98
+
99
+ # Set up Rerun for visualization
100
+ if args.viz:
101
+ rr.script_setup(args, "UniCeption_DUSt3R_Inference")
102
+ rr.set_time("stable_time", sequence=0)
103
+
104
+ # the reference data are collected under this setting.
105
+ # may use (False, "high") to test the relative error at TF32 precision
106
+ torch.backends.cuda.matmul.allow_tf32 = False
107
+ torch.set_float32_matmul_precision("highest")
108
+
109
+ # Get paths to pretrained checkpoints
110
+ current_file_path = os.path.abspath(__file__)
111
+ relative_checkpoint_path = os.path.join(os.path.dirname(current_file_path), "../../../checkpoints")
112
+ model_configurations, model_to_checkpoint_path = get_model_configurations_and_checkpoints()
113
+
114
+ MODEL_TO_VERIFICATION_PATH = {
115
+ "dust3r_512_dpt": {
116
+ "head_output": os.path.join(
117
+ os.path.dirname(current_file_path),
118
+ "../../../reference_data/dust3r_pre_cvpr",
119
+ "DUSt3R_512_dpt",
120
+ "03_head_output.npz",
121
+ )
122
+ },
123
+ "dust3r_512_dpt_mast3r": {
124
+ "head_output": os.path.join(
125
+ os.path.dirname(current_file_path),
126
+ "../../../reference_data/dust3r_pre_cvpr",
127
+ "MASt3R_512_dpt",
128
+ "03_head_output.npz",
129
+ )
130
+ },
131
+ "dust3r_512_linear": {
132
+ "head_output": os.path.join(
133
+ os.path.dirname(current_file_path),
134
+ "../../../reference_data/dust3r_pre_cvpr",
135
+ "DUSt3R_512_linear",
136
+ "03_head_output.npz",
137
+ )
138
+ },
139
+ "dust3r_224_linear": {
140
+ "head_output": os.path.join(
141
+ os.path.dirname(current_file_path),
142
+ "../../../reference_data/dust3r_pre_cvpr",
143
+ "DUSt3R_224_linear",
144
+ "03_head_output.npz",
145
+ )
146
+ },
147
+ }
148
+
149
+ # Test different DUSt3R models using UniCeption modules
150
+ for model_name in model_configurations:
151
+ dust3r_model = DUSt3R(
152
+ name=model_name,
153
+ img_size=(512, 512) if "512" in model_name else (224, 224),
154
+ patch_embed_cls="PatchEmbedDust3R",
155
+ pred_head_type="linear" if "linear" in model_name else "dpt",
156
+ pretrained_checkpoint_path=model_to_checkpoint_path[model_name]["ckpt_path"],
157
+ # pretrained_encoder_checkpoint_path=model_to_checkpoint_path[model_name]["encoder"],
158
+ # pretrained_info_sharing_checkpoint_path=model_to_checkpoint_path[model_name]["info_sharing"],
159
+ # pretrained_pred_head_checkpoint_paths=model_to_checkpoint_path[model_name]["feature_head"],
160
+ # pretrained_pred_head_regressor_checkpoint_paths=model_to_checkpoint_path[model_name]["regressor"],
161
+ # override_encoder_checkpoint_attributes=True,
162
+ )
163
+ print("DUSt3R model initialized successfully!")
164
+
165
+ # Initialize device
166
+ if torch.cuda.is_available():
167
+ device = "cuda:0"
168
+ else:
169
+ device = "cpu"
170
+ dust3r_model.to(device)
171
+
172
+ # Initialize two example images
173
+ img0_url = (
174
+ "https://raw.githubusercontent.com/naver/croco/d3d0ab2858d44bcad54e5bfc24f565983fbe18d9/assets/Chateau1.png"
175
+ )
176
+ img1_url = (
177
+ "https://raw.githubusercontent.com/naver/croco/d3d0ab2858d44bcad54e5bfc24f565983fbe18d9/assets/Chateau2.png"
178
+ )
179
+ response = requests.get(img0_url)
180
+ img0 = Image.open(BytesIO(response.content))
181
+ response = requests.get(img1_url)
182
+ img1 = Image.open(BytesIO(response.content))
183
+ img0_tensor = torch.from_numpy(np.array(img0))[..., :3].permute(2, 0, 1).unsqueeze(0).float() / 255
184
+ img1_tensor = torch.from_numpy(np.array(img1))[..., :3].permute(2, 0, 1).unsqueeze(0).float() / 255
185
+
186
+ # Normalize images according to DUSt3R's normalization
187
+ img0_tensor = (img0_tensor - 0.5) / 0.5
188
+ img1_tensor = (img1_tensor - 0.5) / 0.5
189
+ img_tensor = torch.cat((img0_tensor, img1_tensor), dim=0).to(device)
190
+
191
+ # Run a forward pass
192
+ view1 = {"img": img_tensor, "instance": [0, 1], "data_norm_type": "dust3r"}
193
+ view2 = {"img": view1["img"][[1, 0]].clone().to(device), "instance": [1, 0], "data_norm_type": "dust3r"}
194
+
195
+ res1, res2 = dust3r_model(view1, view2)
196
+ print("Forward pass completed successfully!")
197
+
198
+ # Automatically test the results against the reference result from vanilla dust3r code if they exist
199
+ reference_output_path = MODEL_TO_VERIFICATION_PATH[model_name]["head_output"]
200
+ if os.path.exists(reference_output_path):
201
+ reference_output_data = np.load(reference_output_path)
202
+
203
+ # Check against the reference output
204
+ check_dict = {
205
+ "head1_pts3d": (
206
+ res1["pts3d"].detach().cpu().numpy(),
207
+ reference_output_data["head1_pts3d"],
208
+ ),
209
+ "head2_pts3d": (
210
+ res2["pts3d_in_other_view"].detach().cpu().numpy(),
211
+ reference_output_data["head2_pts3d"],
212
+ ),
213
+ "head1_conf": (
214
+ res1["conf"].detach().squeeze(-1).cpu().numpy(),
215
+ reference_output_data["head1_conf"],
216
+ ),
217
+ "head2_conf": (
218
+ res2["conf"].detach().squeeze(-1).cpu().numpy(),
219
+ reference_output_data["head2_conf"],
220
+ ),
221
+ }
222
+
223
+ compute_abs_and_rel_error = lambda x, y: (np.abs(x - y).max(), np.linalg.norm(x - y) / np.linalg.norm(x))
224
+
225
+ print(f"===== Checking for {model_name} model =====")
226
+ for key, (output, reference) in check_dict.items():
227
+ abs_error, rel_error = compute_abs_and_rel_error(output, reference)
228
+ print(f"{key} abs_error: {abs_error}, rel_error: {rel_error}")
229
+
230
+ assert abs_error < 1e-2 and rel_error < 1e-3, f"Error in {key} output"
231
+
232
+ points1 = res1["pts3d"][0].detach().cpu().numpy()
233
+ points2 = res2["pts3d_in_other_view"][0].detach().cpu().numpy()
234
+ conf_mask1 = res1["conf"][0].squeeze(-1).detach().cpu().numpy() > 3.0
235
+ conf_mask2 = res2["conf"][0].squeeze(-1).detach().cpu().numpy() > 3.0
236
+
237
+ if args.viz:
238
+ rr.log(f"{model_name}", rr.ViewCoordinates.RDF, static=True)
239
+ filtered_pts3d1 = points1[conf_mask1]
240
+ filtered_pts3d1_colors = np.array(img0)[..., :3][conf_mask1] / 255
241
+ filtered_pts3d2 = points2[conf_mask2]
242
+ filtered_pts3d2_colors = np.array(img1)[..., :3][conf_mask2] / 255
243
+ rr.log(
244
+ f"{model_name}/view1",
245
+ rr.Points3D(
246
+ positions=filtered_pts3d1.reshape(-1, 3),
247
+ colors=filtered_pts3d1_colors.reshape(-1, 3),
248
+ ),
249
+ )
250
+ rr.log(
251
+ f"{model_name}/view2",
252
+ rr.Points3D(
253
+ positions=filtered_pts3d2.reshape(-1, 3),
254
+ colors=filtered_pts3d2_colors.reshape(-1, 3),
255
+ ),
256
+ )
257
+ print(
258
+ "Visualizations logged to Rerun: rerun+http://127.0.0.1:<rr-port>/proxy."
259
+ "For example, to spawn viewer: rerun --connect rerun+http://127.0.0.1:<rr-port>/proxy"
260
+ "Replace <rr-port> with the actual port."
261
+ )
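To summarize the interface exercised above: each view is a dict holding an `img` tensor in (B, 3, H, W) normalized with DUSt3R statistics, an `instance` list, and a `data_norm_type` string, and the model returns per-view dicts with `pts3d` / `pts3d_in_other_view` and `conf`. A condensed sketch, assuming the 512 DPT checkpoint path from the configuration table above exists locally (inputs here are random tensors for shape illustration only):

```python
# Condensed two-view inference sketch; checkpoint path and inputs are placeholders.
import torch
from uniception.models.factory import DUSt3R

model = DUSt3R(
    name="dust3r_512_dpt",
    img_size=(512, 512),
    patch_embed_cls="PatchEmbedDust3R",
    pred_head_type="dpt",
    pretrained_checkpoint_path="checkpoints/examples/original_dust3r/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth",
).eval()

imgs = torch.randn(2, 3, 512, 512)  # stand-in for images already normalized as (x - 0.5) / 0.5
view1 = {"img": imgs, "instance": [0, 1], "data_norm_type": "dust3r"}
view2 = {"img": imgs[[1, 0]], "instance": [1, 0], "data_norm_type": "dust3r"}

with torch.no_grad():
    res1, res2 = model(view1, view2)

print(res1["pts3d"].shape, res1["conf"].shape)  # pointmap and confidence for view 1
print(res2["pts3d_in_other_view"].shape)        # view-2 points expressed in view-1's frame
```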
UniCeption/examples/models/dust3r/profile_dust3r.py ADDED
@@ -0,0 +1,47 @@
1
+ import torch
2
+ from dust3r import get_model_configurations_and_checkpoints
3
+
4
+ from uniception.models.factory import DUSt3R
5
+ from uniception.utils.profile import benchmark_torch_function
6
+
7
+ if __name__ == "__main__":
8
+ # Get model configurations and checkpoints
9
+ model_configurations, model_to_checkpoint_path = get_model_configurations_and_checkpoints()
10
+
11
+ # Test different DUSt3R models using UniCeption modules
12
+ for model_name in model_configurations:
13
+ dust3r_model = DUSt3R(
14
+ name=model_name,
15
+ img_size=(512, 512) if "512" in model_name else (224, 224),
16
+ patch_embed_cls="PatchEmbedDust3R",
17
+ pred_head_type="linear" if "linear" in model_name else "dpt",
18
+ pretrained_checkpoint_path=model_to_checkpoint_path[model_name]["ckpt_path"],
19
+ )
20
+ print(f"DUSt3R model ({model_name}) initialized successfully!")
21
+
22
+ # Initialize device
23
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
24
+ dust3r_model.to(device)
25
+ print(f"Running on {device}")
26
+
27
+ # Generate random input tensors
28
+ img_size = (512, 512) if "512" in model_name else (224, 224)
29
+ batch_sizes = [1, 2, 4, 8]
30
+
31
+ for batch_size in batch_sizes:
32
+ # Prepare input views
33
+ view1_instances = range(batch_size)
34
+ view1_img_tensor = torch.randn(batch_size, 3, *img_size).to(device)
35
+ view1 = {"img": view1_img_tensor, "instance": view1_instances, "data_norm_type": "dust3r"}
36
+ view2_instances = range(batch_size)
37
+ view2_instances = [id + batch_size for id in view2_instances]
38
+ view2_img_tensor = torch.randn(batch_size, 3, *img_size).to(device)
39
+ view2 = {"img": view2_img_tensor, "instance": view2_instances, "data_norm_type": "dust3r"}
40
+
41
+ # Benchmark the forward pass of the model
42
+ with torch.no_grad():
43
+ with torch.autocast("cuda", enabled=True):
44
+ execution_time = benchmark_torch_function(dust3r_model, view1, view2)
45
+ print(
46
+ f"\033[92mForward pass for {model_name}, batch size : {batch_size} completed in {execution_time:.3f} milliseconds\033[0m"
47
+ )
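`benchmark_torch_function` comes from `uniception.utils.profile`, which is not part of this file. For readers without access to that module, a rough sketch of what such a timing helper might look like is below; this is an assumption about its behavior (milliseconds per call), not the actual implementation:

```python
# Hypothetical sketch of a benchmark helper; uniception.utils.profile.benchmark_torch_function may differ.
import time
import torch

def benchmark_torch_function_sketch(fn, *args, warmup=3, iters=10, **kwargs):
    for _ in range(warmup):  # warm-up passes (allocator, autotuning, caches)
        fn(*args, **kwargs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(iters):
            fn(*args, **kwargs)
        end.record()
        torch.cuda.synchronize()
        return start.elapsed_time(end) / iters  # milliseconds per call
    t0 = time.perf_counter()
    for _ in range(iters):
        fn(*args, **kwargs)
    return (time.perf_counter() - t0) * 1000.0 / iters
```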
UniCeption/pyproject.toml ADDED
@@ -0,0 +1,21 @@
1
+ [tool.black]
2
+ line-length = 120
3
+ include = '\.pyi?$'
4
+ exclude = '''
5
+ /(
6
+ \.git
7
+ | \.hg
8
+ | \.mypy_cache
9
+ | \.tox
10
+ | \.venv
11
+ | _build
12
+ | buck-out
13
+ | build
14
+ | cuda
15
+ | dist
16
+ )/
17
+ '''
18
+
19
+ [tool.isort]
20
+ profile = "black"
21
+ line_length = 120
UniCeption/scripts/check_dependencies.py ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Console script to check UniCeption dependencies.
4
+ """
5
+
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add the parent directory to the path to import uniception
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+
13
+ def check_dependencies():
14
+ """Check if optional dependencies are available."""
15
+ try:
16
+ import torch
17
+
18
+ print(f"PyTorch version: {torch.__version__}")
19
+ if torch.cuda.is_available():
20
+ print(f"CUDA available: {torch.version.cuda}")
21
+ else:
22
+ print("CUDA not available")
23
+ except ImportError:
24
+ print("PyTorch not installed")
25
+
26
+ try:
27
+ import xformers
28
+
29
+ print(f"XFormers version: {xformers.__version__}")
30
+ except ImportError:
31
+ print("XFormers not installed")
32
+
33
+ try:
34
+ from uniception.models.libs.croco.curope import cuRoPE2D
35
+
36
+ print("CroCo RoPE extension available")
37
+ except ImportError:
38
+ print("CroCo RoPE extension not available")
39
+
40
+
41
+ def main():
42
+ """Main entry point for the check dependencies script."""
43
+ print("Checking UniCeption Dependencies...")
44
+ print("=" * 40)
45
+ check_dependencies()
46
+
47
+
48
+ if __name__ == "__main__":
49
+ main()
UniCeption/scripts/download_checkpoints.py ADDED
@@ -0,0 +1,48 @@
1
+ "Download the UniCeption format checkpoints from the AirLab Data Server"
2
+
3
+ import argparse
4
+ import os
5
+
6
+ from minio import Minio
7
+ from minio.error import S3Error
8
+ from tqdm import tqdm
9
+
10
+
11
+ def main():
12
+ parser = argparse.ArgumentParser(description="Download UniCeption format checkpoints from AirLab Data Server")
13
+ parser.add_argument(
14
+ "--folders",
15
+ nargs="+",
16
+ default=["encoders", "info_sharing", "prediction_heads", "examples"],
17
+ help="List of folders to download (default: all folders). Choices: encoders, info_sharing, prediction_heads, examples",
18
+ )
19
+ parser.add_argument("--destination", type=str, default="./", help="Destination folder for downloaded checkpoints")
20
+ args = parser.parse_args()
21
+
22
+ access_key = "bT79gQYtfhpxFIitlpns"
23
+ secret_key = "g7mSvUJ5k2a9mKv9IbhwXmUQjQX52MLwulhW9ONO"
24
+ client = Minio("airlab-share-02.andrew.cmu.edu:9000", access_key=access_key, secret_key=secret_key, secure=True)
25
+
26
+ bucket_name = "uniception"
27
+
28
+ def download_folder(folder_name, bucket_name, client, destination_folder):
29
+ folder_name = f"checkpoints/{folder_name}/"
30
+ objects = client.list_objects(bucket_name, prefix=folder_name, recursive=True)
31
+ for obj in tqdm(objects, desc=f"Downloading {folder_name}"):
32
+ destination_file = os.path.join(destination_folder, obj.object_name)
33
+ if not os.path.exists(destination_file):
34
+ os.makedirs(os.path.dirname(destination_file), exist_ok=True)
35
+ try:
36
+ client.fget_object(bucket_name, obj.object_name, destination_file)
37
+ print(f"Downloaded {obj.object_name} to {destination_file}")
38
+ except S3Error as e:
39
+ print(f"Error downloading {obj.object_name}: {e}")
40
+ else:
41
+ print(f"File {destination_file} already exists. Skipping...")
42
+
43
+ for folder in args.folders:
44
+ download_folder(folder, bucket_name, client, args.destination)
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
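After a successful download, the checkpoints land under `<destination>/checkpoints/` in the sub-folders referenced by the example and conversion scripts in this commit. A small sanity-check sketch (folder names are inferred from those scripts; adjust the root path to match your `--destination`):

```python
# Sanity-check sketch: confirm the expected checkpoint sub-folders exist after download.
from pathlib import Path

root = Path("./checkpoints")  # replace with your --destination plus "checkpoints"
expected_subdirs = [
    "encoders",
    "info_sharing/cross_attn_transformer",
    "prediction_heads/dpt_feature_head",
    "prediction_heads/dpt_reg_processor",
    "prediction_heads/linear_feature_head",
    "examples/original_dust3r",
]
for sub in expected_subdirs:
    path = root / sub
    print(f"{path}: {'ok' if path.is_dir() else 'missing'}")
```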
UniCeption/scripts/install_croco_rope.py ADDED
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Console script to install CroCo RoPE extension.
4
+ """
5
+
6
+ import os
7
+ import subprocess
8
+ import sys
9
+ from pathlib import Path
10
+
11
+
12
+ def install_croco_rope():
13
+ """Install CroCo RoPE extension."""
14
+ try:
15
+ # Find the project root (where setup.py is located)
16
+ script_dir = Path(__file__).parent
17
+ project_root = script_dir.parent
18
+ curope_path = project_root / "uniception" / "models" / "libs" / "croco" / "curope"
19
+
20
+ if curope_path.exists():
21
+ print("Installing CroCo RoPE extension...")
22
+ original_cwd = os.getcwd()
23
+ try:
24
+ os.chdir(curope_path)
25
+ subprocess.check_call([sys.executable, "setup.py", "build_ext", "--inplace"])
26
+ print("CroCo RoPE extension installed successfully!")
27
+ return True
28
+ except subprocess.CalledProcessError as e:
29
+ print(f"Warning: Failed to install CroCo RoPE extension: {e}")
30
+ print("You can install it later by running:")
31
+ print(f"cd {curope_path} && python setup.py build_ext --inplace")
32
+ return False
33
+ finally:
34
+ os.chdir(original_cwd)
35
+ else:
36
+ print("Warning: CroCo RoPE source code not found.")
37
+ print(f"Expected location: {curope_path}")
38
+ return False
39
+ except Exception as e:
40
+ print(f"Warning: Error during CroCo RoPE installation: {e}")
41
+ return False
42
+
43
+
44
+ def main():
45
+ """Main entry point for the CroCo RoPE installation script."""
46
+ print("UniCeption CroCo RoPE Extension Installer")
47
+ print("=" * 45)
48
+
49
+ success = install_croco_rope()
50
+
51
+ if success:
52
+ print("\n✓ CroCo RoPE extension installation completed successfully!")
53
+ sys.exit(0)
54
+ else:
55
+ print("\n⚠ CroCo RoPE extension installation failed or skipped.")
56
+ print("This is typically due to missing CUDA development tools.")
57
+ print("The extension is optional and UniCeption will work without it.")
58
+ sys.exit(1)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()
UniCeption/scripts/prepare_offline_install.py ADDED
@@ -0,0 +1,399 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to prepare dependencies for offline installation.
4
+
5
+ This script downloads all necessary wheel files for offline installation
6
+ of UniCeption in environments without internet access.
7
+ """
8
+
9
+ import argparse
10
+ import os
11
+ import subprocess
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
+ def download_wheels(output_dir: Path, extras: list = None):
17
+ """Download wheel files for offline installation."""
18
+ output_dir.mkdir(parents=True, exist_ok=True)
19
+
20
+ # Create temporary requirements files
21
+ temp_dir = output_dir / "temp"
22
+ temp_dir.mkdir(exist_ok=True)
23
+
24
+ try:
25
+ # Create requirements files
26
+ create_requirements_files(temp_dir, extras)
27
+
28
+ # Download base dependencies
29
+ base_cmd = [
30
+ sys.executable,
31
+ "-m",
32
+ "pip",
33
+ "download",
34
+ "--dest",
35
+ str(output_dir),
36
+ "-r",
37
+ str(temp_dir / "requirements-base.txt"),
38
+ ]
39
+
40
+ print(f"Downloading base dependencies to {output_dir}...")
41
+ subprocess.check_call(base_cmd)
42
+
43
+ # Download optional dependencies if requested
44
+ if extras:
45
+ for extra in extras:
46
+ if extra == "all":
47
+ # Download all extras
48
+ for req_file in ["requirements-xformers.txt", "requirements-dev.txt"]:
49
+ if (temp_dir / req_file).exists():
50
+ cmd = [
51
+ sys.executable,
52
+ "-m",
53
+ "pip",
54
+ "download",
55
+ "--dest",
56
+ str(output_dir),
57
+ "-r",
58
+ str(temp_dir / req_file),
59
+ ]
60
+ print(
61
+ f"Downloading {req_file.replace('requirements-', '').replace('.txt', '')} dependencies..."
62
+ )
63
+ try:
64
+ subprocess.check_call(cmd)
65
+ except subprocess.CalledProcessError as e:
66
+ print(f"Warning: Failed to download {extra} dependencies: {e}")
67
+ else:
68
+ req_file = temp_dir / f"requirements-{extra}.txt"
69
+ if req_file.exists():
70
+ cmd = [sys.executable, "-m", "pip", "download", "--dest", str(output_dir), "-r", str(req_file)]
71
+ print(f"Downloading {extra} dependencies...")
72
+ try:
73
+ subprocess.check_call(cmd)
74
+ except subprocess.CalledProcessError as e:
75
+ print(f"Warning: Failed to download {extra} dependencies: {e}")
76
+
77
+ # Create final offline installation files
78
+ create_offline_installation_files(output_dir)
79
+
80
+ print("Download completed successfully!")
81
+
82
+ except subprocess.CalledProcessError as e:
83
+ print(f"Error downloading wheels: {e}")
84
+ sys.exit(1)
85
+ finally:
86
+ # Clean up temporary files
87
+ import shutil
88
+
89
+ if temp_dir.exists():
90
+ shutil.rmtree(temp_dir)
91
+
92
+
93
+ def create_requirements_files(temp_dir: Path, extras: list = None):
94
+ """Create temporary requirements files for downloading."""
95
+
96
+ # Base requirements (including PyTorch)
97
+ base_reqs = [
98
+ "numpy",
99
+ "torch",
100
+ "torchvision",
101
+ "torchaudio",
102
+ "timm",
103
+ "black",
104
+ "jaxtyping",
105
+ "matplotlib",
106
+ "Pillow",
107
+ "scikit-learn",
108
+ "einops",
109
+ "rerun-sdk",
110
+ "pre-commit",
111
+ "minio",
112
+ "pytest",
113
+ "isort",
114
+ ]
115
+
116
+ # Write base requirements
117
+ with open(temp_dir / "requirements-base.txt", "w") as f:
118
+ for req in base_reqs:
119
+ f.write(f"{req}\n")
120
+
121
+ # XFormers requirements
122
+ with open(temp_dir / "requirements-xformers.txt", "w") as f:
123
+ f.write("xformers\n")
124
+
125
+ # Dev requirements
126
+ dev_reqs = [
127
+ "black",
128
+ "isort",
129
+ "pre-commit",
130
+ "pytest",
131
+ ]
132
+
133
+ with open(temp_dir / "requirements-dev.txt", "w") as f:
134
+ for req in dev_reqs:
135
+ f.write(f"{req}\n")
136
+
137
+
138
+ def create_offline_installation_files(output_dir: Path):
139
+ """Create requirements files and installation script for offline use."""
140
+
141
+ # Base requirements (including PyTorch)
142
+ base_reqs = [
143
+ "numpy",
144
+ "torch",
145
+ "torchvision",
146
+ "torchaudio",
147
+ "timm",
148
+ "black",
149
+ "jaxtyping",
150
+ "matplotlib",
151
+ "Pillow",
152
+ "scikit-learn",
153
+ "einops",
154
+ "rerun-sdk",
155
+ "pre-commit",
156
+ "minio",
157
+ "pytest",
158
+ "isort",
159
+ ]
160
+
161
+ # Write base requirements
162
+ with open(output_dir / "requirements-base.txt", "w") as f:
163
+ for req in base_reqs:
164
+ f.write(f"{req}\n")
165
+
166
+ # XFormers requirements
167
+ with open(output_dir / "requirements-xformers.txt", "w") as f:
168
+ f.write("xformers\n")
169
+
170
+ # Dev requirements
171
+ dev_reqs = [
172
+ "black",
173
+ "isort",
174
+ "pre-commit",
175
+ "pytest",
176
+ ]
177
+
178
+ with open(output_dir / "requirements-dev.txt", "w") as f:
179
+ for req in dev_reqs:
180
+ f.write(f"{req}\n")
181
+
182
+ # Create installation script
183
+ install_script = output_dir / "install_offline.sh"
184
+ with open(install_script, "w") as f:
185
+ f.write(
186
+ """#!/bin/bash
187
+ # Offline installation script for UniCeption
188
+
189
+ set -e
190
+
191
+ echo "Installing UniCeption dependencies offline..."
192
+
193
+ # Check if we're in the right directory
194
+ if [ ! -f "requirements-base.txt" ]; then
195
+ echo "Error: requirements-base.txt not found. Please run this script from the offline_wheels directory."
196
+ exit 1
197
+ fi
198
+
199
+ # Install base dependencies (includes PyTorch)
200
+ echo "Installing base dependencies (including PyTorch)..."
201
+ pip install --no-index --find-links . -r requirements-base.txt
202
+
203
+ # Install XFormers if requested
204
+ if [ "$INSTALL_XFORMERS" = "true" ]; then
205
+ echo "Installing XFormers..."
206
+ pip install --no-index --find-links . -r requirements-xformers.txt
207
+ fi
208
+
209
+ # Install dev dependencies if requested
210
+ if [ "$INSTALL_DEV" = "true" ]; then
211
+ echo "Installing development dependencies..."
212
+ pip install --no-index --find-links . -r requirements-dev.txt
213
+ fi
214
+
215
+ # Navigate back to UniCeption directory and install the package
216
+ echo "Installing UniCeption package..."
217
+ cd ..
218
+ pip install --no-deps -e .
219
+
220
+ # Install CroCo RoPE extension if requested
221
+ if [ "$INSTALL_CROCO_ROPE" = "true" ]; then
222
+ echo "Installing CroCo RoPE extension..."
223
+ cd uniception/models/libs/croco/curope
224
+ python setup.py build_ext --inplace
225
+ cd -
226
+ fi
227
+
228
+ echo "Offline installation completed successfully!"
229
+ echo ""
230
+ echo "To verify installation, run:"
231
+ echo "python setup.py check_deps"
232
+ """
233
+ )
234
+
235
+ # Make script executable
236
+ install_script.chmod(0o755)
237
+
238
+ # Create Windows batch script as well
239
+ install_bat = output_dir / "install_offline.bat"
240
+ with open(install_bat, "w") as f:
241
+ f.write(
242
+ """@echo off
243
+ REM Offline installation script for UniCeption (Windows)
244
+
245
+ echo Installing UniCeption dependencies offline...
246
+
247
+ REM Check if we're in the right directory
248
+ if not exist "requirements-base.txt" (
249
+ echo Error: requirements-base.txt not found. Please run this script from the offline_wheels directory.
250
+ exit /b 1
251
+ )
252
+
253
+ REM Install base dependencies (includes PyTorch)
254
+ echo Installing base dependencies (including PyTorch)...
255
+ pip install --no-index --find-links . -r requirements-base.txt
256
+
257
+ REM Install XFormers if requested
258
+ if "%INSTALL_XFORMERS%"=="true" (
259
+ echo Installing XFormers...
260
+ pip install --no-index --find-links . -r requirements-xformers.txt
261
+ )
262
+
263
+ REM Install dev dependencies if requested
264
+ if "%INSTALL_DEV%"=="true" (
265
+ echo Installing development dependencies...
266
+ pip install --no-index --find-links . -r requirements-dev.txt
267
+ )
268
+
269
+ REM Navigate back to UniCeption directory and install the package
270
+ echo Installing UniCeption package...
271
+ cd ..
272
+ pip install --no-deps -e .
273
+
274
+ REM Install CroCo RoPE extension if requested
275
+ if "%INSTALL_CROCO_ROPE%"=="true" (
276
+ echo Installing CroCo RoPE extension...
277
+ cd uniception\\models\\libs\\croco\\curope
278
+ python setup.py build_ext --inplace
279
+ cd ..\\..\\..\\..\\..
280
+ )
281
+
282
+ echo Offline installation completed successfully!
283
+ echo.
284
+ echo To verify installation, run:
285
+ echo python setup.py check_deps
286
+ """
287
+ )
288
+
289
+ # Create a README for offline installation
290
+ readme_file = output_dir / "README_OFFLINE.md"
291
+ with open(readme_file, "w") as f:
292
+ f.write(
293
+ """# UniCeption Offline Installation
294
+
295
+ This directory contains all the necessary files for installing UniCeption without internet access.
296
+
297
+ ## Files Included
298
+
299
+ - `requirements-base.txt` - Core dependencies (including PyTorch)
300
+ - `requirements-xformers.txt` - XFormers dependency
301
+ - `requirements-dev.txt` - Development dependencies
302
+ - `install_offline.sh` - Installation script for Unix/Linux/macOS
303
+ - `install_offline.bat` - Installation script for Windows
304
+ - `*.whl` files - Downloaded wheel packages
305
+
306
+ ## Installation Instructions
307
+
308
+ ### Unix/Linux/macOS
309
+
310
+ ```bash
311
+ # Set environment variables for optional components
312
+ export INSTALL_XFORMERS=true # Install XFormers
313
+ export INSTALL_DEV=true # Install development tools
314
+ export INSTALL_CROCO_ROPE=true # Compile CroCo RoPE extension
315
+
316
+ # Run the installation script
317
+ ./install_offline.sh
318
+ ```
319
+
320
+ ### Windows
321
+
322
+ ```cmd
323
+ REM Set environment variables for optional components
324
+ set INSTALL_XFORMERS=true
325
+ set INSTALL_DEV=true
326
+ set INSTALL_CROCO_ROPE=true
327
+
328
+ REM Run the installation script
329
+ install_offline.bat
330
+ ```
331
+
332
+ ## Manual Installation
333
+
334
+ If the scripts don't work, you can install manually:
335
+
336
+ ```bash
337
+ # Install base dependencies (includes PyTorch)
338
+ pip install --no-index --find-links . -r requirements-base.txt
339
+
340
+ # Install optional dependencies as needed
341
+ pip install --no-index --find-links . -r requirements-xformers.txt
342
+ pip install --no-index --find-links . -r requirements-dev.txt
343
+
344
+ # Install UniCeption package (from parent directory)
345
+ cd ..
346
+ pip install --no-deps -e .
347
+
348
+ # Compile CroCo RoPE extension (optional)
349
+ cd uniception/models/libs/croco/curope
350
+ python setup.py build_ext --inplace
351
+ ```
352
+
353
+ ## Verification
354
+
355
+ After installation, verify everything is working:
356
+
357
+ ```bash
358
+ cd .. # Go back to UniCeption root directory
359
+ python setup.py check_deps
360
+ ```
361
+
362
+ ## Notes
363
+
364
+ - PyTorch, TorchVision, and TorchAudio are now included in the base requirements
365
+ - XFormers is optional and only needed for certain performance optimizations
366
+ - CroCo RoPE extension compilation requires a CUDA-enabled environment
367
+ """
368
+ )
369
+
370
+ print(f"Created offline installation files in {output_dir}")
371
+ print("Files created:")
372
+ print(" - requirements-base.txt (includes PyTorch)")
373
+ print(" - requirements-xformers.txt")
374
+ print(" - requirements-dev.txt")
375
+ print(" - install_offline.sh (Unix/Linux/macOS)")
376
+ print(" - install_offline.bat (Windows)")
377
+ print(" - README_OFFLINE.md")
378
+
379
+
380
+ def create_offline_requirements(output_dir: Path):
381
+ """Create requirements files for offline installation."""
382
+ # This function is now replaced by create_offline_installation_files
383
+ pass
384
+
385
+
386
+ def main():
387
+ parser = argparse.ArgumentParser(description="Prepare UniCeption for offline installation")
388
+ parser.add_argument(
389
+ "--output-dir", type=Path, default="offline_wheels", help="Directory to store downloaded wheels"
390
+ )
391
+ parser.add_argument("--extras", nargs="+", choices=["xformers", "dev", "all"], help="Extra dependencies to include")
392
+
393
+ args = parser.parse_args()
394
+
395
+ download_wheels(args.output_dir, args.extras)
396
+
397
+
398
+ if __name__ == "__main__":
399
+ main()
UniCeption/scripts/validate_installation.py ADDED
@@ -0,0 +1,213 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Validation script for UniCeption installation.
4
+
5
+ This script validates that all components of UniCeption are correctly installed
6
+ and provides helpful diagnostics.
7
+ """
8
+
9
+ import importlib
10
+ import sys
11
+ from pathlib import Path
12
+
13
+
14
+ def check_package_installation():
15
+ """Check if UniCeption package is properly installed."""
16
+ try:
17
+ import uniception
18
+
19
+ print("✓ UniCeption package is installed")
20
+
21
+ # Check if we can import core modules
22
+ try:
23
+ from uniception.models.encoders import UniCeptionViTEncoderBase
24
+
25
+ print("✓ Core encoder modules are available")
26
+ except ImportError as e:
27
+ print(f"✗ Failed to import core encoder modules: {e}")
28
+
29
+ return True
30
+ except ImportError as e:
31
+ print(f"✗ UniCeption package not found: {e}")
32
+ return False
33
+
34
+
35
+ def check_dependencies():
36
+ """Check optional dependencies."""
37
+ dependencies = {
38
+ "torch": "PyTorch",
39
+ "torchvision": "TorchVision",
40
+ "torchaudio": "TorchAudio",
41
+ "xformers": "XFormers",
42
+ "timm": "Timm (PyTorch Image Models)",
43
+ "einops": "Einops",
44
+ "matplotlib": "Matplotlib",
45
+ "numpy": "NumPy",
46
+ "PIL": "Pillow",
47
+ }
48
+
49
+ available = []
50
+ missing = []
51
+
52
+ for module, name in dependencies.items():
53
+ try:
54
+ mod = importlib.import_module(module)
55
+ version = getattr(mod, "__version__", "unknown")
56
+ available.append(f"✓ {name}: {version}")
57
+ except ImportError:
58
+ missing.append(f"✗ {name}: not installed")
59
+
60
+ print("\nDependency Status:")
61
+ for dep in available:
62
+ print(f" {dep}")
63
+
64
+ if missing:
65
+ print("\nMissing Dependencies:")
66
+ for dep in missing:
67
+ print(f" {dep}")
68
+
69
+ return len(missing) == 0
70
+
71
+
72
+ def check_cuda_support():
73
+ """Check CUDA support."""
74
+ try:
75
+ import torch
76
+
77
+ if torch.cuda.is_available():
78
+ print(f"\n✓ CUDA is available")
79
+ print(f" CUDA version: {torch.version.cuda}")
80
+ print(f" Available devices: {torch.cuda.device_count()}")
81
+ for i in range(torch.cuda.device_count()):
82
+ print(f" Device {i}: {torch.cuda.get_device_name(i)}")
83
+ else:
84
+ print(f"\n⚠ CUDA is not available (CPU-only mode)")
85
+ except ImportError:
86
+ print(f"\n⚠ PyTorch not installed - cannot check CUDA support")
87
+
88
+
89
+ def check_croco_rope():
90
+ """Check CroCo RoPE extension."""
91
+ try:
92
+ from uniception.models.libs.croco.curope import cuRoPE2D
93
+
94
+ print("\n✓ CroCo RoPE extension is available")
95
+ return True
96
+ except ImportError:
97
+ print("\n✗ CroCo RoPE extension not available")
98
+ print(" To install: cd uniception/models/libs/croco/curope && python setup.py build_ext --inplace")
99
+ return False
100
+
101
+
102
+ def check_model_availability():
103
+ """Check if models can be loaded."""
104
+ try:
105
+ # Try to check if encoder modules are available
106
+ from uniception.models import encoders
107
+
108
+ print(f"\n✓ Encoder module is available")
109
+
110
+ # Try to run the encoder list command
111
+ try:
112
+ import subprocess
113
+
114
+ result = subprocess.run(
115
+ [sys.executable, "-m", "uniception.models.encoders.list"], capture_output=True, text=True, timeout=10
116
+ )
117
+
118
+ if result.returncode == 0:
119
+ lines = result.stdout.strip().split("\n")
120
+ encoder_count = len([line for line in lines if line.strip() and not line.startswith("Available")])
121
+ print(f"✓ Available encoders: {encoder_count}")
122
+ return True
123
+ else:
124
+ print(f"⚠ Encoder listing returned non-zero exit code: {result.returncode}")
125
+ return False
126
+
127
+ except subprocess.TimeoutExpired:
128
+ print(f"⚠ Encoder listing timed out")
129
+ return False
130
+ except Exception as e:
131
+ print(f"⚠ Could not run encoder listing: {e}")
132
+ return False
133
+
134
+ except Exception as e:
135
+ print(f"\n✗ Failed to access encoder modules: {e}")
136
+ return False
137
+
138
+
139
+ def check_file_structure():
140
+ """Check if the project file structure is correct."""
141
+ base_path = Path(__file__).parent.parent
142
+ required_dirs = [
143
+ "uniception",
144
+ "uniception/models",
145
+ "uniception/models/encoders",
146
+ "uniception/models/info_sharing",
147
+ "uniception/models/prediction_heads",
148
+ "scripts",
149
+ "tests",
150
+ ]
151
+
152
+ missing_dirs = []
153
+ for dir_path in required_dirs:
154
+ full_path = base_path / dir_path
155
+ if not full_path.exists():
156
+ missing_dirs.append(dir_path)
157
+
158
+ if missing_dirs:
159
+ print(f"\n✗ Missing directories:")
160
+ for dir_path in missing_dirs:
161
+ print(f" - {dir_path}")
162
+ return False
163
+ else:
164
+ print(f"\n✓ Project structure is correct")
165
+ return True
166
+
167
+
168
+ def main():
169
+ """Run all validation checks."""
170
+ print("UniCeption Installation Validation")
171
+ print("=" * 40)
172
+
173
+ checks = [
174
+ ("Package Installation", check_package_installation),
175
+ ("Dependencies", check_dependencies),
176
+ ("CUDA Support", check_cuda_support),
177
+ ("CroCo RoPE Extension", check_croco_rope),
178
+ ("Model Availability", check_model_availability),
179
+ ("File Structure", check_file_structure),
180
+ ]
181
+
182
+ results = []
183
+ for name, check_func in checks:
184
+ print(f"\nChecking {name}...")
185
+ try:
186
+ result = check_func()
187
+ results.append((name, result))
188
+ except Exception as e:
189
+ print(f"✗ Error during {name} check: {e}")
190
+ results.append((name, False))
191
+
192
+ # Summary
193
+ print("\n" + "=" * 40)
194
+ print("Validation Summary:")
195
+ passed = 0
196
+ for name, result in results:
197
+ status = "✓ PASS" if result else "✗ FAIL"
198
+ print(f" {name}: {status}")
199
+ if result:
200
+ passed += 1
201
+
202
+ print(f"\nOverall: {passed}/{len(results)} checks passed")
203
+
204
+ if passed == len(results):
205
+ print("🎉 All checks passed! UniCeption is ready to use.")
206
+ return 0
207
+ else:
208
+ print("⚠ Some checks failed. Please review the issues above.")
209
+ return 1
210
+
211
+
212
+ if __name__ == "__main__":
213
+ sys.exit(main())
UniCeption/setup.py ADDED
@@ -0,0 +1,188 @@
1
+ """Package installation setup."""
2
+
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from setuptools import find_packages, setup
9
+ from setuptools.command.develop import develop
10
+ from setuptools.command.install import install
11
+
12
+
13
+ def install_croco_rope():
14
+ """Install CroCo RoPE extension."""
15
+ try:
16
+ curope_path = Path(__file__).parent / "uniception" / "models" / "libs" / "croco" / "curope"
17
+ if curope_path.exists():
18
+ print("Installing CroCo RoPE extension...")
19
+ original_cwd = os.getcwd()
20
+ try:
21
+ os.chdir(curope_path)
22
+ subprocess.check_call([sys.executable, "setup.py", "build_ext", "--inplace"])
23
+ print("CroCo RoPE extension installed successfully!")
24
+ return True
25
+ except subprocess.CalledProcessError as e:
26
+ print(f"Warning: Failed to install CroCo RoPE extension: {e}")
27
+ print("You can install it later by running:")
28
+ print(f"cd {curope_path} && python setup.py build_ext --inplace")
29
+ return False
30
+ finally:
31
+ os.chdir(original_cwd)
32
+ else:
33
+ print("Warning: CroCo RoPE source code not found.")
34
+ return False
35
+ except Exception as e:
36
+ print(f"Warning: Error during CroCo RoPE installation: {e}")
37
+ return False
38
+
39
+
40
+ def check_dependencies():
41
+ """Check if optional dependencies are available."""
42
+ try:
43
+ import torch
44
+
45
+ print(f"PyTorch version: {torch.__version__}")
46
+ if torch.cuda.is_available():
47
+ print(f"CUDA available: {torch.version.cuda}")
48
+ else:
49
+ print("CUDA not available")
50
+ except ImportError:
51
+ print("PyTorch not installed")
52
+
53
+ try:
54
+ import xformers
55
+
56
+ print(f"XFormers version: {xformers.__version__}")
57
+ except ImportError:
58
+ print("XFormers not installed")
59
+
60
+ try:
61
+ from uniception.models.libs.croco.curope import cuRoPE2D
62
+
63
+ print("CroCo RoPE extension available")
64
+ except ImportError:
65
+ print("CroCo RoPE extension not available")
66
+
67
+
68
+ class CustomDevelopCommand(develop):
69
+ """Custom development installation command."""
70
+
71
+ def run(self):
72
+ develop.run(self)
73
+ # Only install CroCo RoPE if explicitly requested
74
+ if os.getenv("INSTALL_CROCO_ROPE", "false").lower() in ("true", "1", "yes"):
75
+ install_croco_rope()
76
+
77
+
78
+ class CustomInstallCommand(install):
79
+ """Custom installation command."""
80
+
81
+ def run(self):
82
+ install.run(self)
83
+ # Only install CroCo RoPE if explicitly requested
84
+ if os.getenv("INSTALL_CROCO_ROPE", "false").lower() in ("true", "1", "yes"):
85
+ install_croco_rope()
86
+
87
+
88
+ class CrocoInstallCommand(install):
89
+ """Install command that includes CroCo RoPE extension."""
90
+
91
+ def run(self):
92
+ install.run(self)
93
+ install_croco_rope()
94
+
95
+
96
+ class CheckDependenciesCommand(install):
97
+ """Command to check available dependencies."""
98
+
99
+ def run(self):
100
+ check_dependencies()
101
+
102
+
103
+ # Core dependencies (including PyTorch which is essential for UniCeption)
104
+ install_requires = [
105
+ "numpy",
106
+ "torch",
107
+ "torchvision",
108
+ "torchaudio",
109
+ "timm",
110
+ "black",
111
+ "jaxtyping",
112
+ "matplotlib",
113
+ "Pillow",
114
+ "scikit-learn",
115
+ "einops",
116
+ "rerun-sdk",
117
+ "pre-commit",
118
+ "minio",
119
+ "pytest",
120
+ "isort",
121
+ ]
122
+
123
+ # Optional dependencies
124
+ extras_require = {
125
+ "xformers": [
126
+ "xformers", # Will be installed from PyTorch wheel index
127
+ ],
128
+ "dev": [
129
+ "black",
130
+ "isort",
131
+ "pre-commit",
132
+ "pytest",
133
+ ],
134
+ "minimal": [
135
+ # Minimal dependencies for basic functionality without PyTorch
136
+ "numpy",
137
+ "matplotlib",
138
+ "Pillow",
139
+ "scikit-learn",
140
+ "einops",
141
+ ],
142
+ }
143
+
144
+ # All optional dependencies combined (excluding minimal since it's subset of install_requires)
145
+ extras_require["all"] = list(set(extras_require["xformers"] + extras_require["dev"]))
146
+
147
+ setup(
148
+ name="uniception",
149
+ version="0.1.0",
150
+ description="Generalizable Perception Stack for 3D, 4D, spatial AI and scene understanding",
151
+ long_description=open("README.md").read(),
152
+ long_description_content_type="text/markdown",
153
+ author="AirLab",
154
+ license="BSD 3-Clause",
155
+ packages=find_packages(),
156
+ package_dir={"": "."},
157
+ include_package_data=True,
158
+ python_requires=">=3.10",
159
+ install_requires=install_requires,
160
+ extras_require=extras_require,
161
+ cmdclass={
162
+ "develop": CustomDevelopCommand,
163
+ "install": CustomInstallCommand,
164
+ "install_croco": CrocoInstallCommand,
165
+ "check_deps": CheckDependenciesCommand,
166
+ },
167
+ entry_points={
168
+ "console_scripts": [
169
+ "uniception-download-checkpoints=scripts.download_checkpoints:main",
170
+ "uniception-validate=scripts.validate_installation:main",
171
+ "uniception-prepare-offline=scripts.prepare_offline_install:main",
172
+ "uniception-check-deps=scripts.check_dependencies:main",
173
+ "uniception-install-croco=scripts.install_croco_rope:main",
174
+ ],
175
+ },
176
+ classifiers=[
177
+ "Development Status :: 3 - Alpha",
178
+ "Intended Audience :: Developers",
179
+ "Intended Audience :: Science/Research",
180
+ "Programming Language :: Python :: 3",
181
+ "Programming Language :: Python :: 3.10",
182
+ "Programming Language :: Python :: 3.11",
183
+ "Programming Language :: Python :: 3.12",
184
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
185
+ "Topic :: Software Development :: Libraries :: Python Modules",
186
+ ],
187
+ keywords="computer-vision, 3d-vision, spatial-ai, perception, deep-learning, pytorch",
188
+ )
UniCeption/tests/models/encoders/conftest.py ADDED
@@ -0,0 +1,26 @@
1
+ import pytest
2
+
3
+
4
+ def pytest_addoption(parser):
5
+ # Add custom command-line options
6
+ parser.addoption("--encoder-name", action="store", default=None, help="Specify the encoder name to test")
7
+
8
+ parser.addoption(
9
+ "--device",
10
+ action="store",
11
+ default="cpu",
12
+ choices=["cpu", "gpu"],
13
+ help="Specify the device to use (default: cpu)",
14
+ )
15
+
16
+
17
+ @pytest.fixture
18
+ def encoder_name(request):
19
+ # Access the value of the custom option for encoder name
20
+ return request.config.getoption("--encoder-name")
21
+
22
+
23
+ @pytest.fixture
24
+ def device(request):
25
+ # Access the value of the custom option for device
26
+ return request.config.getoption("--device")
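These fixtures let the encoder tests be narrowed to one encoder and device from the command line, e.g. `pytest --encoder-name dinov2_base --device gpu`. A hypothetical programmatic invocation of the same run:

```python
# Hypothetical programmatic run of the encoder tests using the custom options above.
import pytest

pytest.main([
    "tests/models/encoders/test_encoders.py",
    "--encoder-name", "dinov2_base",  # restrict to a single encoder
    "--device", "gpu",                # or "cpu" (the default)
])
```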
UniCeption/tests/models/encoders/test_encoders.py ADDED
@@ -0,0 +1,204 @@
1
+ import os
2
+ import random
3
+ from functools import lru_cache
4
+ from typing import Tuple
5
+
6
+ import numpy as np
7
+ import pytest
8
+ import requests
9
+ import torch
10
+ from PIL import Image
11
+
12
+ from uniception.models.encoders import *
13
+ from uniception.models.encoders.image_normalizations import *
14
+
15
+
16
+ @pytest.fixture(scope="module")
17
+ def norm_types():
18
+ return IMAGE_NORMALIZATION_DICT.keys()
19
+
20
+
21
+ @pytest.fixture(scope="module")
22
+ def encoders():
23
+ return [
24
+ "croco",
25
+ "dust3r_224",
26
+ "dust3r_512",
27
+ "dust3r_512_dpt",
28
+ "mast3r_512",
29
+ "dinov2_base",
30
+ "dinov2_large",
31
+ "dinov2_large_reg",
32
+ "dinov2_large_dav2",
33
+ "dinov2_giant",
34
+ "dinov2_giant_reg",
35
+ "radio_v2.5-b",
36
+ "radio_v2.5-l",
37
+ "e-radio_v2",
38
+ "naradio_v2.5-b",
39
+ "naradio_v2.5-l",
40
+ "cosmosx8",
41
+ "patch_embedder",
42
+ ]
43
+
44
+
45
+ @pytest.fixture(scope="module")
46
+ def encoder_configs(encoders):
47
+ # Adjust the number of configs to match the number of encoders
48
+ return [{}] * len(encoders)
49
+
50
+
51
+ @pytest.fixture
52
+ def device(request):
53
+ # Access the value of the custom option for device
54
+ device_str = request.config.getoption("--device")
55
+ if device_str == "gpu" and torch.cuda.is_available():
56
+ device = torch.device("cuda") # Use the default CUDA device
57
+ else:
58
+ device = torch.device("cpu")
59
+ print(f"Using device: {device.type.upper()}")
60
+ return device
61
+
62
+
63
+ @pytest.fixture
64
+ def example_input(device):
65
+ @lru_cache(maxsize=3)
66
+ def _get_example_input(
67
+ image_size: Tuple[int, int],
68
+ image_norm_type: str = "dummy",
69
+ img_selection: int = 1,
70
+ return_viz_img: bool = False,
71
+ ) -> torch.Tensor:
72
+ url = f"https://raw.githubusercontent.com/naver/croco/d3d0ab2858d44bcad54e5bfc24f565983fbe18d9/assets/Chateau{img_selection}.png"
73
+ image = Image.open(requests.get(url, stream=True).raw)
74
+ image = image.resize(image_size)
75
+ image = image.convert("RGB")
76
+
77
+ img = torch.from_numpy(np.array(image))
78
+ viz_img = img.clone()
79
+
80
+ # Normalize the image
81
+ image_normalization = IMAGE_NORMALIZATION_DICT[image_norm_type]
82
+ img_mean = image_normalization.mean
83
+ img_std = image_normalization.std
84
+ img = (img.float() / 255.0 - img_mean) / img_std
85
+
86
+ # Convert to BCHW format
87
+ img = img.permute(2, 0, 1).unsqueeze(0).to(device)
88
+
89
+ if return_viz_img:
90
+ return img, viz_img
91
+ else:
92
+ return img
93
+
94
+ return _get_example_input
95
+
96
+
97
+ def inference_encoder(encoder, encoder_input):
98
+ # Encoder expects a ViTEncoderInput object
99
+ return encoder(encoder_input).features
100
+
101
+
102
+ def test_make_dummy_encoder(device):
103
+ print(f"Testing Init of Dummy Encoder on {device.type.upper()}")
104
+ encoder = _make_encoder_test("dummy").to(device)
105
+
106
+ # Check if the encoder has parameters
107
+ try:
108
+ params = list(encoder.parameters())
109
+ if not params:
110
+ print("Warning: The encoder has no parameters.")
111
+ else:
112
+ # Verify if the model is on the right device
113
+ assert params[0].is_cuda == (device.type == "cuda")
114
+
115
+ except Exception as e:
116
+ print(f"Error: {e}")
117
+ assert False # Fail the test if any error occurs
118
+
119
+ assert encoder is not None
120
+
121
+
122
+ def test_all_encoder_basics(encoders, encoder_configs, norm_types, example_input, encoder_name, device):
123
+ if encoder_name:
124
+ encoders = [encoder_name] # Override default encoders with the one specified
125
+
126
+ for encoder_name, encoder_config in zip(encoders, encoder_configs):
127
+ print(f"Testing encoder: {encoder_name} on {device.type.upper()}")
128
+
129
+ encoder = _make_encoder_test(encoder_name, **encoder_config).to(device)
130
+ _check_baseclass_attribute(encoder, norm_types)
131
+ _check_norm_check_function(encoder)
132
+
133
+ if isinstance(encoder, UniCeptionViTEncoderBase):
134
+ _check_vit_encoder_attribute(encoder)
135
+ _test_vit_encoder_patch_size(encoder, example_input)
136
+
137
+
138
+ def _check_baseclass_attribute(encoder, norm_types):
139
+ assert hasattr(encoder, "name")
140
+ assert hasattr(encoder, "size")
141
+ assert hasattr(encoder, "data_norm_type")
142
+
143
+ assert isinstance(encoder.name, str)
144
+ assert isinstance(encoder.size, str) or encoder.size is None
145
+ assert isinstance(encoder.data_norm_type, str)
146
+
147
+ # Check if the data_norm_type is in the list of normalization types
148
+ assert encoder.data_norm_type in norm_types
149
+
150
+
151
+ def _check_norm_check_function(encoder):
152
+ assert hasattr(encoder, "_check_data_normalization_type")
153
+
154
+ encoder_norm_type = encoder.data_norm_type
155
+
156
+ try:
157
+ encoder._check_data_normalization_type(encoder_norm_type)
158
+ except AssertionError:
159
+ assert False
160
+
161
+ try:
162
+ encoder._check_data_normalization_type("some_nonexistent_norm_type")
163
+ assert False
164
+ except AssertionError:
165
+ pass
166
+
167
+
168
+ def _check_vit_encoder_attribute(encoder):
169
+ assert hasattr(encoder, "patch_size")
170
+ assert isinstance(encoder.patch_size, int)
171
+ assert encoder.patch_size > 0
172
+
173
+
174
+ def _test_vit_encoder_patch_size(encoder, example_input):
175
+ print(f"Testing {encoder.name} inference")
176
+ image_size = (14 * encoder.patch_size, 14 * encoder.patch_size)
177
+
178
+ img = example_input(image_size, encoder.data_norm_type)
179
+ # Create an instance of ViTEncoderInput with correct attributes
180
+ encoder_input = ViTEncoderInput(
181
+ data_norm_type=encoder.data_norm_type,
182
+ image=img,
183
+ )
184
+
185
+ encoder_output = inference_encoder(encoder, encoder_input)
186
+
187
+ assert isinstance(encoder_output, torch.Tensor)
188
+ assert encoder_output.shape[2] == 14
189
+ assert encoder_output.shape[3] == 14
190
+
191
+
192
+ @pytest.fixture(scope="session", autouse=True)
193
+ def seed_everything():
194
+ seed = 42
195
+ random.seed(seed)
196
+ os.environ["PYTHONHASHSEED"] = str(seed)
197
+ np.random.seed(seed)
198
+ torch.manual_seed(seed)
199
+ torch.backends.cudnn.deterministic = True
200
+ torch.backends.cudnn.benchmark = False
201
+ print(f"Seed set to: {seed} (type: {type(seed)})")
202
+
203
+ # Turn XFormers off for testing on CPU
204
+ os.environ["XFORMERS_DISABLED"] = "1"
UniCeption/tests/models/encoders/viz_image_encoders.py ADDED
@@ -0,0 +1,294 @@
1
+ """
2
+ PCA Visualization of UniCeption Image Encoders
3
+ """
4
+
5
+ import os
6
+ import random
7
+ from functools import lru_cache
8
+ from typing import Tuple
9
+
10
+ import numpy as np
11
+ import requests
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from matplotlib import pyplot as plt
15
+ from PIL import Image
16
+ from sklearn.decomposition import PCA
17
+
18
+ from uniception.models.encoders import *
19
+ from uniception.models.encoders.image_normalizations import *
20
+
21
+
22
+ class TestEncoders:
23
+ def __init__(self, pca_save_folder, *args, **kwargs):
24
+ super(TestEncoders, self).__init__(*args, **kwargs)
25
+
26
+ self.pca_save_folder = pca_save_folder
27
+
28
+ self.norm_types = IMAGE_NORMALIZATION_DICT.keys()
29
+
30
+ self.encoders = [
31
+ "croco",
32
+ "dust3r_224",
33
+ "dust3r_512",
34
+ "dust3r_512_dpt",
35
+ "mast3r_512",
36
+ "dinov2_large",
37
+ "dinov2_large_reg",
38
+ "dinov2_large_dav2",
39
+ "dinov2_giant",
40
+ "dinov2_giant_reg",
41
+ "radio_v2.5-b",
42
+ "radio_v2.5-l",
43
+ "e-radio_v2",
44
+ ]
45
+
46
+ self.encoder_configs = [{}] * len(self.encoders)
47
+
48
+ def inference_encoder(self, encoder, input):
49
+ return encoder(input)
50
+
51
+ def visualize_all_encoders(self):
52
+ for encoder, encoder_config in zip(self.encoders, self.encoder_configs):
53
+ encoder = _make_encoder_test(encoder, **encoder_config)
54
+ self._visualize_encoder_features_consistency(encoder, (224, 224))
55
+
56
+ def _visualize_encoder_features(self, encoder, image_size: Tuple[int, int]):
57
+ img, viz_img = self._get_example_input(image_size, encoder.data_norm_type, return_viz_img=True)
58
+ # input and output of the encoder
59
+ encoder_input: ViTEncoderInput = ViTEncoderInput(
60
+ data_norm_type=encoder.data_norm_type,
61
+ image=img,
62
+ )
63
+
64
+ encoder_output = self.inference_encoder(encoder, encoder_input)
65
+ encoder_output = encoder_output.features
66
+
67
+ assert isinstance(encoder_output, torch.Tensor)
68
+
69
+ # visualize the features
70
+ pca_viz = get_pca_map(encoder_output.permute(0, 2, 3, 1), image_size, return_pca_stats=False)
71
+
72
+ # plot the input image and the PCA features
73
+ fig, axs = plt.subplots(1, 2, figsize=(12, 6))
74
+ axs[0].imshow(viz_img)
75
+ axs[0].set_title("Input Image")
76
+ axs[0].axis("off")
77
+ axs[1].imshow(pca_viz)
78
+ axs[1].set_title(f"PCA Features of {encoder.name}")
79
+ axs[1].axis("off")
80
+ plt.savefig(f"{self.pca_save_folder}/pca_{encoder.name}.png", bbox_inches="tight")
81
+ plt.close()
82
+
83
+ def _visualize_encoder_features_consistency(self, encoder, image_size: Tuple[int, int]):
84
+ img0, viz_img0 = self._get_example_input(
85
+ image_size, encoder.data_norm_type, img_selection=1, return_viz_img=True
86
+ )
87
+ img1, viz_img1 = self._get_example_input(
88
+ image_size, encoder.data_norm_type, img_selection=2, return_viz_img=True
89
+ )
90
+ # input and output of the encoder
91
+ encoder_input0: ViTEncoderInput = ViTEncoderInput(
92
+ data_norm_type=encoder.data_norm_type,
93
+ image=img0,
94
+ )
95
+
96
+ encoder_input1: ViTEncoderInput = ViTEncoderInput(
97
+ data_norm_type=encoder.data_norm_type,
98
+ image=img1,
99
+ )
100
+
101
+ encoder_output0 = self.inference_encoder(encoder, encoder_input0)
102
+ encoder_output0 = encoder_output0.features
103
+
104
+ encoder_output1 = self.inference_encoder(encoder, encoder_input1)
105
+ encoder_output1 = encoder_output1.features
106
+
107
+ # get a common PCA codec
108
+ cat_feats = torch.cat([encoder_output0, encoder_output1], dim=3)
109
+
110
+ pca_viz = get_pca_map(cat_feats.permute(0, 2, 3, 1), (image_size[0], image_size[1] * 2), return_pca_stats=True)
111
+
112
+ # concatenate the input images along the width dimension
113
+ cat_imgs = torch.cat([viz_img0, viz_img1], dim=1)
114
+
115
+ # plot the input image and the PCA features
116
+ fig, axs = plt.subplots(1, 2, figsize=(12, 6))
117
+ axs[0].imshow(cat_imgs)
118
+ axs[0].set_title("Input Images")
119
+ axs[0].axis("off")
120
+ axs[1].imshow(pca_viz[0])
121
+ axs[1].set_title(f"PCA Features of {encoder.name}")
122
+ axs[1].axis("off")
123
+ plt.savefig(f"{self.pca_save_folder}/multi_pca_{encoder.name}.png", bbox_inches="tight")
124
+ plt.close()
125
+
126
+ @lru_cache(maxsize=3)
127
+ def _get_example_input(
128
+ self,
129
+ image_size: Tuple[int, int],
130
+ image_norm_type: str = "dummy",
131
+ img_selection: int = 1,
132
+ return_viz_img: bool = False,
133
+ ) -> torch.Tensor:
134
+ url = f"https://raw.githubusercontent.com/naver/croco/d3d0ab2858d44bcad54e5bfc24f565983fbe18d9/assets/Chateau{img_selection}.png"
135
+ image = Image.open(requests.get(url, stream=True).raw)
136
+ image = image.resize(image_size)
137
+ image = image.convert("RGB")
138
+
139
+ img = torch.from_numpy(np.array(image))
140
+ viz_img = img.clone()
141
+
142
+ # Normalize the images
143
+ image_normalization = IMAGE_NORMALIZATION_DICT[image_norm_type]
144
+
145
+ img_mean, img_std = image_normalization.mean, image_normalization.std
146
+
147
+ img = (img.float() / 255.0 - img_mean) / img_std
148
+
149
+ # convert to BCHW format
150
+ img = img.permute(2, 0, 1).unsqueeze(0)
151
+
152
+ if return_viz_img:
153
+ return img, viz_img
154
+ else:
155
+ return img
156
+
157
+
158
+ def render_pca_as_rgb(features):
159
+ """
160
+ Perform PCA on the given feature tensor and render the first 3 principal components as RGB.
161
+
162
+ Args:
163
+ features (torch.Tensor): Feature tensor of shape (B, C, H, W).
164
+
165
+ Returns:
166
+ np.ndarray: RGB image of shape (H, W, 3).
167
+ """
168
+ # Ensure input is a 4D tensor
169
+ assert features.dim() == 4, "Input tensor must be 4D (B, C, H, W)"
170
+
171
+ B, C, H, W = features.shape
172
+
173
+ # Reshape the tensor to (B * H * W, C)
174
+ reshaped_features = features.permute(0, 2, 3, 1).contiguous().view(-1, C).cpu().numpy()
175
+
176
+ # Perform PCA
177
+ pca = PCA(n_components=3)
178
+ principal_components = pca.fit_transform(reshaped_features)
179
+
180
+ # Rescale the principal components to [0, 1]
181
+ principal_components = (principal_components - principal_components.min(axis=0)) / (
182
+ principal_components.max(axis=0) - principal_components.min(axis=0)
183
+ )
184
+
185
+ # Reshape the principal components to (B, H, W, 3)
186
+ principal_components = principal_components.reshape(B, H, W, 3)
187
+
188
+ # Convert the principal components to RGB image (take the first batch)
189
+ rgb_image = principal_components[0]
190
+
191
+ return rgb_image
192
+
193
+
194
+ def get_robust_pca(features: torch.Tensor, m: float = 2, remove_first_component=False):
195
+ # features: (N, C)
196
+ # m: a hyperparam controlling how many std dev outside for outliers
197
+ assert len(features.shape) == 2, "features should be (N, C)"
198
+ reduction_mat = torch.pca_lowrank(features, q=3, niter=20)[2]
199
+ colors = features @ reduction_mat
200
+ if remove_first_component:
201
+ colors_min = colors.min(dim=0).values
202
+ colors_max = colors.max(dim=0).values
203
+ tmp_colors = (colors - colors_min) / (colors_max - colors_min)
204
+ fg_mask = tmp_colors[..., 0] < 0.2
205
+ reduction_mat = torch.pca_lowrank(features[fg_mask], q=3, niter=20)[2]
206
+ colors = features @ reduction_mat
207
+ else:
208
+ fg_mask = torch.ones_like(colors[:, 0]).bool()
209
+ d = torch.abs(colors[fg_mask] - torch.median(colors[fg_mask], dim=0).values)
210
+ mdev = torch.median(d, dim=0).values
211
+ s = d / mdev
212
+ try:
213
+ rins = colors[fg_mask][s[:, 0] < m, 0]
214
+ gins = colors[fg_mask][s[:, 1] < m, 1]
215
+ bins = colors[fg_mask][s[:, 2] < m, 2]
216
+ rgb_min = torch.tensor([rins.min(), gins.min(), bins.min()])
217
+ rgb_max = torch.tensor([rins.max(), gins.max(), bins.max()])
218
+ except:
219
+ rins = colors
220
+ gins = colors
221
+ bins = colors
222
+ rgb_min = torch.tensor([rins.min(), gins.min(), bins.min()])
223
+ rgb_max = torch.tensor([rins.max(), gins.max(), bins.max()])
224
+
225
+ return reduction_mat, rgb_min.to(reduction_mat), rgb_max.to(reduction_mat)
226
+
227
+
228
+ def get_pca_map(
229
+ feature_map: torch.Tensor,
230
+ img_size,
231
+ interpolation="bicubic",
232
+ return_pca_stats=False,
233
+ pca_stats=None,
234
+ ):
235
+ """
236
+ feature_map: (1, h, w, C) is the feature map of a single image.
237
+ """
238
+ if feature_map.shape[0] != 1:
239
+ # make it (1, h, w, C)
240
+ feature_map = feature_map[None]
241
+ if pca_stats is None:
242
+ reduct_mat, color_min, color_max = get_robust_pca(feature_map.reshape(-1, feature_map.shape[-1]))
243
+ else:
244
+ reduct_mat, color_min, color_max = pca_stats
245
+ pca_color = feature_map @ reduct_mat
246
+ pca_color = (pca_color - color_min) / (color_max - color_min)
247
+ pca_color = pca_color.clamp(0, 1)
248
+ pca_color = F.interpolate(
249
+ pca_color.permute(0, 3, 1, 2),
250
+ size=img_size,
251
+ mode=interpolation,
252
+ ).permute(0, 2, 3, 1)
253
+ pca_color = pca_color.detach().cpu().numpy().squeeze(0)
254
+ if return_pca_stats:
255
+ return pca_color, (reduct_mat, color_min, color_max)
256
+ return pca_color
257
+
258
+
259
+ def seed_everything(seed=42):
260
+ """
261
+ Set the `seed` value for torch and numpy seeds. Also turns on
262
+ deterministic execution for cudnn.
263
+
264
+ Parameters:
265
+ - seed: A hashable seed value
266
+ """
267
+ random.seed(seed)
268
+ os.environ["PYTHONHASHSEED"] = str(seed)
269
+ np.random.seed(seed)
270
+ torch.manual_seed(seed)
271
+ torch.backends.cudnn.deterministic = True
272
+ torch.backends.cudnn.benchmark = False
273
+ print(f"Seed set to: {seed} (type: {type(seed)})")
274
+
275
+
276
+ if __name__ == "__main__":
277
+ # Turn XFormers off for testing on CPU
278
+ os.environ["XFORMERS_DISABLED"] = "1"
279
+
280
+ # Seed everything for consistent testing
281
+ seed_everything()
282
+
283
+ # Create local directory for storing the PCA images
284
+ current_file_path = os.path.abspath(__file__)
285
+ relative_pca_image_folder = os.path.join(os.path.dirname(current_file_path), "../../../local/encoders/pca_images")
286
+ os.makedirs(relative_pca_image_folder, exist_ok=True)
287
+
288
+ # Initialize the test class
289
+ test = TestEncoders(pca_save_folder=relative_pca_image_folder)
290
+
291
+ # Visualize the PCA of all encoders
292
+ test.visualize_all_encoders()
293
+
294
+ print(f"The PCA visualizations of all encoders are saved successfully to {relative_pca_image_folder}!")
UniCeption/tests/models/info_sharing/viz_mulit_view_cross_attn_transformers.py ADDED
@@ -0,0 +1,337 @@
1
+ """
2
+ PCA Visualization of UniCeption Image Encoders + Multi-View Cross Attention Transformers
3
+ """
4
+
5
+ import os
6
+ import random
7
+ from functools import lru_cache
8
+ from typing import Tuple
9
+
10
+ import numpy as np
11
+ import requests
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from matplotlib import pyplot as plt
15
+ from PIL import Image
16
+ from sklearn.decomposition import PCA
17
+
18
+ from uniception.models.encoders import *
19
+ from uniception.models.encoders.image_normalizations import *
20
+ from uniception.models.info_sharing.base import MultiViewTransformerInput
21
+ from uniception.models.info_sharing.cross_attention_transformer import MultiViewCrossAttentionTransformerIFR
22
+ from uniception.models.libs.croco.pos_embed import RoPE2D, get_2d_sincos_pos_embed
23
+
24
+
25
+ def _make_mv_cross_attention_transformer_test(model_str: str, **kwargs):
26
+ current_file_path = os.path.abspath(__file__)
27
+ relative_checkpoint_path = os.path.join(
28
+ os.path.dirname(current_file_path), "../../../checkpoints/info_sharing/cross_attn_transformer"
29
+ )
30
+ rope = RoPE2D(float(100))
31
+ if model_str == "croco":
32
+ return MultiViewCrossAttentionTransformerIFR(
33
+ name="croco_base_decoder",
34
+ input_embed_dim=1024,
35
+ num_views=2,
36
+ indices=[12 * 2 // 4, 12 * 3 // 4],
37
+ norm_intermediate=False,
38
+ custom_positional_encoding=rope,
39
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/Two_View_Cross_Attention_Transformer_CroCo.pth",
40
+ **kwargs,
41
+ )
42
+ elif model_str == "dust3r_224":
43
+ return MultiViewCrossAttentionTransformerIFR(
44
+ name="dust3r_224_base_decoder",
45
+ input_embed_dim=1024,
46
+ num_views=2,
47
+ indices=[12 * 2 // 4, 12 * 3 // 4],
48
+ norm_intermediate=False,
49
+ custom_positional_encoding=rope,
50
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/Two_View_Cross_Attention_Transformer_DUSt3R_224_linear.pth",
51
+ **kwargs,
52
+ )
53
+ elif model_str == "dust3r_512":
54
+ return MultiViewCrossAttentionTransformerIFR(
55
+ name="dust3r_512_base_decoder",
56
+ input_embed_dim=1024,
57
+ num_views=2,
58
+ indices=[12 * 2 // 4, 12 * 3 // 4],
59
+ norm_intermediate=False,
60
+ custom_positional_encoding=rope,
61
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/Two_View_Cross_Attention_Transformer_DUSt3R_512_linear.pth",
62
+ **kwargs,
63
+ )
64
+ elif model_str == "dust3r_512_dpt":
65
+ return MultiViewCrossAttentionTransformerIFR(
66
+ name="dust3r_512_dpt_base_decoder",
67
+ input_embed_dim=1024,
68
+ num_views=2,
69
+ indices=[12 * 2 // 4, 12 * 3 // 4],
70
+ norm_intermediate=False,
71
+ custom_positional_encoding=rope,
72
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/Two_View_Cross_Attention_Transformer_DUSt3R_512_dpt.pth",
73
+ **kwargs,
74
+ )
75
+ elif model_str == "mast3r_512":
76
+ return MultiViewCrossAttentionTransformerIFR(
77
+ name="mast3r_512_base_decoder",
78
+ input_embed_dim=1024,
79
+ num_views=2,
80
+ indices=[12 * 2 // 4, 12 * 3 // 4],
81
+ norm_intermediate=False,
82
+ custom_positional_encoding=rope,
83
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/Two_View_Cross_Attention_Transformer_MASt3R.pth",
84
+ **kwargs,
85
+ )
86
+
87
+
88
+ class TestMultiViewTransformers:
89
+ def __init__(self, pca_save_folder, *args, **kwargs):
90
+ super(TestMultiViewTransformers, self).__init__(*args, **kwargs)
91
+
92
+ self.pca_save_folder = pca_save_folder
93
+
94
+ self.norm_types = IMAGE_NORMALIZATION_DICT.keys()
95
+
96
+ self.models = [
97
+ "croco",
98
+ "dust3r_224",
99
+ "dust3r_512",
100
+ "dust3r_512_dpt",
101
+ "mast3r_512",
102
+ ]
103
+
104
+ self.model_configs = [{}] * len(self.models)
105
+
106
+ def inference_encoder(self, encoder, input):
107
+ return encoder(input)
108
+
109
+ def inference_info_sharing(self, info_sharing, input):
110
+ return info_sharing(input)
111
+
112
+ def visualize_all_models(self):
113
+ for model, model_config in zip(self.models, self.model_configs):
114
+ encoder = _make_encoder_test(model, **model_config)
115
+ info_sharing = _make_mv_cross_attention_transformer_test(model, **model_config)
116
+ self._visualize_model_features_consistency(encoder, info_sharing, (224, 224))
117
+
118
+ def _visualize_model_features_consistency(self, encoder, info_sharing, image_size: Tuple[int, int]):
119
+ img0, viz_img0 = self._get_example_input(
120
+ image_size, encoder.data_norm_type, img_selection=1, return_viz_img=True
121
+ )
122
+ img1, viz_img1 = self._get_example_input(
123
+ image_size, encoder.data_norm_type, img_selection=2, return_viz_img=True
124
+ )
125
+ # input and output of the encoder
126
+ encoder_input0: ViTEncoderInput = ViTEncoderInput(
127
+ data_norm_type=encoder.data_norm_type,
128
+ image=img0,
129
+ )
130
+
131
+ encoder_input1: ViTEncoderInput = ViTEncoderInput(
132
+ data_norm_type=encoder.data_norm_type,
133
+ image=img1,
134
+ )
135
+
136
+ encoder_output0 = self.inference_encoder(encoder, encoder_input0)
137
+ encoder_output0 = encoder_output0.features
138
+
139
+ encoder_output1 = self.inference_encoder(encoder, encoder_input1)
140
+ encoder_output1 = encoder_output1.features
141
+
142
+ # pass the encoder outputs to the info sharing model
143
+ multi_view_features = [encoder_output0, encoder_output1]
144
+ info_sharing_input = MultiViewTransformerInput(features=multi_view_features)
145
+ info_sharing_output = self.inference_info_sharing(info_sharing, info_sharing_input)
146
+ final_layer_multi_view_features = info_sharing_output[0].features
147
+
148
+ # get a common PCA codec
149
+ cat_feats = torch.cat(final_layer_multi_view_features, dim=3)
150
+
151
+ pca_viz = get_pca_map(cat_feats.permute(0, 2, 3, 1), (image_size[0], image_size[1] * 2), return_pca_stats=True)
152
+
153
+ # concatenate the input images along the width dimension
154
+ cat_imgs = torch.cat([viz_img0, viz_img1], dim=1)
155
+
156
+ # plot the input image and the PCA features
157
+ fig, axs = plt.subplots(1, 2, figsize=(12, 6))
158
+ axs[0].imshow(cat_imgs)
159
+ axs[0].set_title("Input Images")
160
+ axs[0].axis("off")
161
+ axs[1].imshow(pca_viz[0])
162
+ axs[1].set_title(f"PCA Features of {encoder.name} + Base Decoder")
163
+ axs[1].axis("off")
164
+ plt.savefig(f"{self.pca_save_folder}/multi_pca_{encoder.name}.png", bbox_inches="tight")
165
+ plt.close()
166
+
167
+ @lru_cache(maxsize=3)
168
+ def _get_example_input(
169
+ self,
170
+ image_size: Tuple[int, int],
171
+ image_norm_type: str = "dummy",
172
+ img_selection: int = 1,
173
+ return_viz_img: bool = False,
174
+ ) -> torch.Tensor:
175
+ url = f"https://raw.githubusercontent.com/naver/croco/d3d0ab2858d44bcad54e5bfc24f565983fbe18d9/assets/Chateau{img_selection}.png"
176
+ image = Image.open(requests.get(url, stream=True).raw)
177
+ image = image.resize(image_size)
178
+ image = image.convert("RGB")
179
+
180
+ img = torch.from_numpy(np.array(image))
181
+ viz_img = img.clone()
182
+
183
+ # Normalize the images
184
+ image_normalization = IMAGE_NORMALIZATION_DICT[image_norm_type]
185
+
186
+ img_mean, img_std = image_normalization.mean, image_normalization.std
187
+
188
+ img = (img.float() / 255.0 - img_mean) / img_std
189
+
190
+ # convert to BCHW format
191
+ img = img.permute(2, 0, 1).unsqueeze(0)
192
+
193
+ if return_viz_img:
194
+ return img, viz_img
195
+ else:
196
+ return img
197
+
198
+
199
+ def render_pca_as_rgb(features):
200
+ """
201
+ Perform PCA on the given feature tensor and render the first 3 principal components as RGB.
202
+
203
+ Args:
204
+ features (torch.Tensor): Feature tensor of shape (B, C, H, W).
205
+
206
+ Returns:
207
+ np.ndarray: RGB image of shape (H, W, 3).
208
+ """
209
+ # Ensure input is a 4D tensor
210
+ assert features.dim() == 4, "Input tensor must be 4D (B, C, H, W)"
211
+
212
+ B, C, H, W = features.shape
213
+
214
+ # Reshape the tensor to (B * H * W, C)
215
+ reshaped_features = features.permute(0, 2, 3, 1).contiguous().view(-1, C).cpu().numpy()
216
+
217
+ # Perform PCA
218
+ pca = PCA(n_components=3)
219
+ principal_components = pca.fit_transform(reshaped_features)
220
+
221
+ # Rescale the principal components to [0, 1]
222
+ principal_components = (principal_components - principal_components.min(axis=0)) / (
223
+ principal_components.max(axis=0) - principal_components.min(axis=0)
224
+ )
225
+
226
+ # Reshape the principal components to (B, H, W, 3)
227
+ principal_components = principal_components.reshape(B, H, W, 3)
228
+
229
+ # Convert the principal components to RGB image (take the first batch)
230
+ rgb_image = principal_components[0]
231
+
232
+ return rgb_image
233
+
234
+
235
+ def get_robust_pca(features: torch.Tensor, m: float = 2, remove_first_component=False):
236
+ # features: (N, C)
237
+ # m: a hyperparam controlling how many std dev outside for outliers
238
+ assert len(features.shape) == 2, "features should be (N, C)"
239
+ reduction_mat = torch.pca_lowrank(features, q=3, niter=20)[2]
240
+ colors = features @ reduction_mat
241
+ if remove_first_component:
242
+ colors_min = colors.min(dim=0).values
243
+ colors_max = colors.max(dim=0).values
244
+ tmp_colors = (colors - colors_min) / (colors_max - colors_min)
245
+ fg_mask = tmp_colors[..., 0] < 0.2
246
+ reduction_mat = torch.pca_lowrank(features[fg_mask], q=3, niter=20)[2]
247
+ colors = features @ reduction_mat
248
+ else:
249
+ fg_mask = torch.ones_like(colors[:, 0]).bool()
250
+ d = torch.abs(colors[fg_mask] - torch.median(colors[fg_mask], dim=0).values)
251
+ mdev = torch.median(d, dim=0).values
252
+ s = d / mdev
253
+ try:
254
+ rins = colors[fg_mask][s[:, 0] < m, 0]
255
+ gins = colors[fg_mask][s[:, 1] < m, 1]
256
+ bins = colors[fg_mask][s[:, 2] < m, 2]
257
+ rgb_min = torch.tensor([rins.min(), gins.min(), bins.min()])
258
+ rgb_max = torch.tensor([rins.max(), gins.max(), bins.max()])
259
+ except:
260
+ rins = colors
261
+ gins = colors
262
+ bins = colors
263
+ rgb_min = torch.tensor([rins.min(), gins.min(), bins.min()])
264
+ rgb_max = torch.tensor([rins.max(), gins.max(), bins.max()])
265
+
266
+ return reduction_mat, rgb_min.to(reduction_mat), rgb_max.to(reduction_mat)
267
+
268
+
269
+ def get_pca_map(
270
+ feature_map: torch.Tensor,
271
+ img_size,
272
+ interpolation="bicubic",
273
+ return_pca_stats=False,
274
+ pca_stats=None,
275
+ ):
276
+ """
277
+ feature_map: (1, h, w, C) is the feature map of a single image.
278
+ """
279
+ if feature_map.shape[0] != 1:
280
+ # make it (1, h, w, C)
281
+ feature_map = feature_map[None]
282
+ if pca_stats is None:
283
+ reduct_mat, color_min, color_max = get_robust_pca(feature_map.reshape(-1, feature_map.shape[-1]))
284
+ else:
285
+ reduct_mat, color_min, color_max = pca_stats
286
+ pca_color = feature_map @ reduct_mat
287
+ pca_color = (pca_color - color_min) / (color_max - color_min)
288
+ pca_color = pca_color.clamp(0, 1)
289
+ pca_color = F.interpolate(
290
+ pca_color.permute(0, 3, 1, 2),
291
+ size=img_size,
292
+ mode=interpolation,
293
+ ).permute(0, 2, 3, 1)
294
+ pca_color = pca_color.detach().cpu().numpy().squeeze(0)
295
+ if return_pca_stats:
296
+ return pca_color, (reduct_mat, color_min, color_max)
297
+ return pca_color
298
+
299
+
300
+ def seed_everything(seed=42):
301
+ """
302
+ Set the `seed` value for torch and numpy seeds. Also turns on
303
+ deterministic execution for cudnn.
304
+
305
+ Parameters:
306
+ - seed: A hashable seed value
307
+ """
308
+ random.seed(seed)
309
+ os.environ["PYTHONHASHSEED"] = str(seed)
310
+ np.random.seed(seed)
311
+ torch.manual_seed(seed)
312
+ torch.backends.cudnn.deterministic = True
313
+ torch.backends.cudnn.benchmark = False
314
+ print(f"Seed set to: {seed} (type: {type(seed)})")
315
+
316
+
317
+ if __name__ == "__main__":
318
+ # Turn XFormers off for testing on CPU
319
+ os.environ["XFORMERS_DISABLED"] = "1"
320
+
321
+ # Seed everything for consistent testing
322
+ seed_everything()
323
+
324
+ # Create local directory for storing the PCA images
325
+ current_file_path = os.path.abspath(__file__)
326
+ relative_pca_image_folder = os.path.join(
327
+ os.path.dirname(current_file_path), "../../../local/info_sharing/pca_images"
328
+ )
329
+ os.makedirs(relative_pca_image_folder, exist_ok=True)
330
+
331
+ # Initialize the test class
332
+ test = TestMultiViewTransformers(pca_save_folder=relative_pca_image_folder)
333
+
334
+ # Visualize the PCA of all models
335
+ test.visualize_all_models()
336
+
337
+ print(f"The PCA visualizations of all models are saved successfully to {relative_pca_image_folder}!")
UniCeption/uniception/__init__.py ADDED
File without changes
UniCeption/uniception/models/encoders/README.md ADDED
@@ -0,0 +1,129 @@
1
+ # UniCeption Encoders
2
+
3
+ ## Currently Supported Encoders
4
+
5
+ ### UniCeptionViTEncoderBase:
6
+
7
+ - `CroCoEncoder`
8
+ - `CroCoIntermediateFeatureReturner`
9
+ - `DINOv2Encoder`
10
+ - `DINOv2IntermediateFeatureReturner`
11
+ - `PatchEmbedder`
12
+ - `RADIOEncoder`
13
+ - `RADIOIntermediateFeatureReturner`
14
+
15
+ # Developer Guidelines for UniCeption Encoders
16
+
17
+ ## Overview
18
+
19
+ This folder contains the implementation of various UniCeption encoders. Each encoder must adhere to a specific structure and follow certain guidelines to ensure consistency and compatibility across different projects.
20
+
21
+ ## Directory Structure
22
+
23
+ The encoders and other necessary dependencies/tests for encoders are organized as follows:
24
+ ```
25
+ uniception/
26
+ ├── models/
27
+ │ ├── encoders/
28
+ │ │ ├── __init__.py
29
+ │ │ ├── base.py
30
+ │ │ ├── croco.py
31
+ │ │ ├── dinov2.py
32
+ │ │ ├── radio.py
33
+ │ │ ├── image_normalizations.py
34
+ │ └── ...
35
+ │ └── libs/
36
+ │ │ ├── external_dependency_folders/
37
+ │ │ │ ├── external_dependency_files
38
+ tests/
39
+ ├── models/
40
+ │ ├── encoders/
41
+ │ │ ├── test_encoders.py
42
+ │ │ ├── viz_image_encoders.py
43
+ │ │ └── ...
44
+ │ └── ...
45
+ └── ...
46
+ ```
47
+
48
+ ## Adding a New Encoder
49
+
50
+ To add a new encoder, follow these steps:
51
+
52
+ 1. **Create a New Encoder File**:
53
+ - Create a new file in the `encoders` directory, e.g., `new_encoder.py`.
54
+ - Define the new encoder class in this file, inheriting from `UniCeptionEncoderBase` or `UniCeptionViTEncoderBase`.
55
+ - Please look at the base class for the necessary attributes and methods to implement.
56
+
57
+ 2. **Define Input Data Normalization**:
58
+ - Add the corresponding normalization for the encoder to respective normalization files, for example, image normalizations should be added to `image_normalizations.py`.
59
+ - Ensure the normalization is added to the dictionaries present in the files, for example, `IMAGE_NORMALIZATION_DICT`.
60
+
61
+ 3. **Implement the Encoder Class**:
62
+ - Inherit from `UniCeptionEncoderBase` or `UniCeptionViTEncoderBase` or other UniCeption base classes.
63
+ - Implement the `forward` method.
64
+ - Ensure the encoder class has the necessary attributes and methods.
65
+
66
+ 4. **Update `__init__.py`**:
67
+ - Import the new encoder class in `__init__.py`.
68
+ - Add the new encoder to the encoder configuration dictionary `ENCODER_CONFIGS` so that it can be instantiated via the encoder factory (a registration sketch follows this list).
69
+ - Update the `_make_encoder_test` function to include the new encoder.
70
+
71
+ 5. **Run Encoder Unit Tests**:
72
+ - Run `pytest -vs tests/models/encoders/test_encoders.py --encoder-name="<new_encoder>"` to test the basic expected functionality of UniCeption encoders.
73
+ - Also, add your new encoder to the list returned by the `encoders()` fixture in `tests/models/encoders/test_encoders.py` so that it is tested along with all the existing encoders.
74
+ - Optionally, for image encoders, the visualization script `tests/models/encoders/viz_image_encoders.py` saves PCA visualizations of the encoder outputs to the `local/encoders/pca_images` directory.
75
+
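A hedged sketch of the `ENCODER_CONFIGS` registration mentioned in step 4; the `NewEncoder` import path and display name are hypothetical:

```python
# Hypothetical registration in uniception/models/encoders/__init__.py
from uniception.models.encoders.new_encoder import NewEncoder  # hypothetical module

ENCODER_CONFIGS["new_encoder"] = {
    "class": NewEncoder,
    "supported_models": ["New-Encoder"],
}
```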
76
+ ## Example Encoder Implementation
77
+
78
+ Here is an example of how to implement a new encoder:
79
+
80
+ ```python
81
+ # new_encoder.py
82
+ import torch
83
+ from uniception.models.encoders.base import UniCeptionEncoderBase, EncoderInput, EncoderOutput
84
+
85
+ class NewEncoder(UniCeptionEncoderBase):
86
+ def __init__(self, name: str, data_norm_type: str, *args, **kwargs):
87
+ super().__init__(name=name, data_norm_type=data_norm_type, *args, **kwargs)
88
+ # Initialize encoder-specific layers and parameters here
89
+
90
+ def forward(self, encoder_input: EncoderInput) -> EncoderOutput:
91
+ self._check_data_normalization_type(encoder_input.data_norm_type)
92
+ # Implement the forward pass
93
+ return EncoderOutput()
94
+ ```
95
+
96
+ ## Example Normalization
97
+
98
+ Add the normalization for the new encoder, for example, to `image_normalizations.py`:
99
+
100
+ ```python
101
+ # image_normalizations.py
102
+ IMAGE_NORMALIZATION_DICT = {
103
+ "dummy": ImageNormalization(mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([1.0, 1.0, 1.0])),
104
+ "croco": ImageNormalization(mean=torch.tensor([0.485, 0.456, 0.406]), std=torch.tensor([0.229, 0.224, 0.225])),
105
+ "dust3r": ImageNormalization(mean=torch.tensor([0.5, 0.5, 0.5]), std=torch.tensor([0.5, 0.5, 0.5])),
106
+ "dinov2": ImageNormalization(mean=torch.tensor([0.485, 0.456, 0.406]), std=torch.tensor([0.229, 0.224, 0.225])),
107
+ "radio": ImageNormalization(mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([1.0, 1.0, 1.0])),
108
+ "new_encoder": ImageNormalization(mean=torch.tensor([0.5, 0.5, 0.5]), std=torch.tensor([0.2, 0.2, 0.2])),
109
+ }
110
+ ```
111
+
112
+ ## Example Unit Testing
113
+
114
+ Add the new encoder to the encoder factory in `__init__.py` and the encoder list in `tests/models/encoders/test_encoders.py`. Additional tests can also be added as required.
115
+
116
+ Look at `tests/models/encoders/test_encoders.py` to see what tests are run.
117
+
118
+ Additionally, if the new encoder is an image encoder, you can add it to the encoder list in `tests/models/encoders/viz_image_encoders.py` to save PCA visualizations of the encoder outputs to the `local/encoders/pca_images` directory.
119
+
120
+ ## Developer Guidelines
121
+
122
+ Please follow these guidelines when contributing to the UniCeption encoders:
123
+ - **Consistency**: Ensure that the new encoder follows the structure and naming conventions of existing encoders.
124
+ - **Code Style**: Follow the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) for code style.
125
+ - **Documentation**: Add docstrings to all classes and methods.
126
+ - **Unit Tests**: Add necessary unit tests for the encoder class.
127
+ - **Linting**: Run `black` on your code before committing. For example, you can run `black uniception`.
128
+
129
+ ## Happy Coding!
UniCeption/uniception/models/encoders/__init__.py ADDED
@@ -0,0 +1,235 @@
1
+ """
2
+ Encoder Factory for UniCeption
3
+ """
4
+
5
+ import os
6
+
7
+ from uniception.models.encoders.base import (
8
+ EncoderGlobalRepInput,
9
+ EncoderInput,
10
+ UniCeptionEncoderBase,
11
+ UniCeptionViTEncoderBase,
12
+ ViTEncoderInput,
13
+ ViTEncoderNonImageInput,
14
+ ViTEncoderOutput,
15
+ )
16
+ from uniception.models.encoders.cosmos import CosmosEncoder
17
+ from uniception.models.encoders.croco import CroCoEncoder, CroCoIntermediateFeatureReturner
18
+ from uniception.models.encoders.dense_rep_encoder import DenseRepresentationEncoder
19
+ from uniception.models.encoders.dinov2 import DINOv2Encoder, DINOv2IntermediateFeatureReturner
20
+ from uniception.models.encoders.global_rep_encoder import GlobalRepresentationEncoder
21
+ from uniception.models.encoders.naradio import NARADIOEncoder
22
+ from uniception.models.encoders.patch_embedder import PatchEmbedder
23
+ from uniception.models.encoders.radio import RADIOEncoder, RADIOIntermediateFeatureReturner
24
+
25
+ # Define encoder configurations
26
+ ENCODER_CONFIGS = {
27
+ "croco": {
28
+ "class": CroCoEncoder,
29
+ "intermediate_feature_returner_class": CroCoIntermediateFeatureReturner,
30
+ "supported_models": ["CroCov2", "DUSt3R", "MASt3R"],
31
+ },
32
+ "dense_rep_encoder": {
33
+ "class": DenseRepresentationEncoder,
34
+ "supported_models": ["Dense-Representation-Encoder"],
35
+ },
36
+ "dinov2": {
37
+ "class": DINOv2Encoder,
38
+ "intermediate_feature_returner_class": DINOv2IntermediateFeatureReturner,
39
+ "supported_models": ["DINOv2", "DINOv2-Registers", "DINOv2-Depth-Anythingv2"],
40
+ },
41
+ "global_rep_encoder": {
42
+ "class": GlobalRepresentationEncoder,
43
+ "supported_models": ["Global-Representation-Encoder"],
44
+ },
45
+ "patch_embedder": {
46
+ "class": PatchEmbedder,
47
+ "supported_models": ["Patch-Embedder"],
48
+ },
49
+ "radio": {
50
+ "class": RADIOEncoder,
51
+ "intermediate_feature_returner_class": RADIOIntermediateFeatureReturner,
52
+ "supported_models": ["RADIO", "E-RADIO"],
53
+ },
54
+ "cosmos": {
55
+ "class": CosmosEncoder,
56
+ "supported_models": ["Cosmos-Tokenizer CI8x8", "Cosmos-Tokenizer CI16x16"],
57
+ },
58
+ "naradio": {
59
+ "class": NARADIOEncoder,
60
+ "supported_models": ["RADIO"],
61
+ },
62
+ # Add other encoders here
63
+ }
64
+
65
+
66
+ def encoder_factory(encoder_str: str, **kwargs) -> UniCeptionEncoderBase:
67
+ """
68
+ Encoder factory for UniCeption.
69
+ Please use python3 -m uniception.models.encoders.list to see available encoders.
70
+
71
+ Args:
72
+ encoder_str (str): Name of the encoder to create.
73
+ **kwargs: Additional keyword arguments to pass to the encoder constructor.
74
+
75
+ Returns:
76
+ UniCeptionEncoderBase: An instance of the specified encoder.
77
+ """
78
+ if encoder_str not in ENCODER_CONFIGS:
79
+ raise ValueError(
80
+ f"Unknown encoder: {encoder_str}. For valid encoder_str options, please use python3 -m uniception.models.encoders.list"
81
+ )
82
+
83
+ encoder_config = ENCODER_CONFIGS[encoder_str]
84
+ encoder_class = encoder_config["class"]
85
+
86
+ return encoder_class(**kwargs)
87
+
88
+
89
+ def feature_returner_encoder_factory(encoder_str: str, **kwargs) -> UniCeptionEncoderBase:
90
+ """
91
+ Factory for UniCeption Encoders with support for intermediate feature returning.
92
+ Please use python3 -m uniception.models.encoders.list to see available encoders.
93
+
94
+ Args:
95
+ encoder_str (str): Name of the encoder to create.
96
+ **kwargs: Additional keyword arguments to pass to the encoder constructor.
97
+
98
+ Returns:
99
+ UniCeptionEncoderBase: An instance of the specified encoder.
100
+ """
101
+ if encoder_str not in ENCODER_CONFIGS:
102
+ raise ValueError(
103
+ f"Unknown encoder: {encoder_str}. For valid encoder_str options, please use python3 -m uniception.models.encoders.list"
104
+ )
105
+
106
+ encoder_config = ENCODER_CONFIGS[encoder_str]
107
+ encoder_class = encoder_config["intermediate_feature_returner_class"]
108
+
109
+ return encoder_class(**kwargs)
110
+
111
+
112
+ def get_available_encoders() -> list:
113
+ """
114
+ Get a list of available encoders in UniCeption.
115
+
116
+ Returns:
117
+ list: A list of available encoder names.
118
+ """
119
+ return list(ENCODER_CONFIGS.keys())
120
+
121
+
122
+ def print_available_encoder_models():
123
+ """
124
+ Print the currently supported encoders in UniCeption.
125
+ """
126
+ print("Currently Supported Encoders in UniCeption:\nFormat -> encoder_str: supported_models")
127
+ for encoder_name, config in ENCODER_CONFIGS.items():
128
+ print(f"{encoder_name}: {', '.join(config['supported_models'])}")
129
+
130
+
131
+ def _make_encoder_test(encoder_str: str, **kwargs) -> UniCeptionEncoderBase:
132
+ "Function to create encoders for testing purposes."
133
+ current_file_path = os.path.abspath(__file__)
134
+ relative_checkpoint_path = os.path.join(os.path.dirname(current_file_path), "../../../checkpoints/encoders")
135
+ if encoder_str == "dummy":
136
+ return UniCeptionEncoderBase(name="dummy", data_norm_type="dummy")
137
+ elif encoder_str == "croco":
138
+ return CroCoEncoder(
139
+ name="croco",
140
+ data_norm_type="croco",
141
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/CroCo_Encoder_224.pth",
142
+ patch_embed_cls="PatchEmbedCroCo",
143
+ )
144
+ elif encoder_str == "dust3r_224":
145
+ return CroCoEncoder(
146
+ name="dust3r_224",
147
+ data_norm_type="dust3r",
148
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/CroCo_Encoder_224_DUSt3R_linear.pth",
149
+ patch_embed_cls="PatchEmbedDust3R",
150
+ )
151
+ elif encoder_str == "dust3r_512":
152
+ return CroCoEncoder(
153
+ name="dust3r_512",
154
+ data_norm_type="dust3r",
155
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/CroCo_Encoder_512_DUSt3R_linear.pth",
156
+ patch_embed_cls="ManyAR_PatchEmbed",
157
+ img_size=(512, 512),
158
+ )
159
+ elif encoder_str == "dust3r_512_dpt":
160
+ return CroCoEncoder(
161
+ name="dust3r_512_dpt",
162
+ data_norm_type="dust3r",
163
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/CroCo_Encoder_512_DUSt3R_dpt.pth",
164
+ patch_embed_cls="ManyAR_PatchEmbed",
165
+ img_size=(512, 512),
166
+ )
167
+ elif encoder_str == "mast3r_512":
168
+ return CroCoEncoder(
169
+ name="mast3r_512",
170
+ data_norm_type="dust3r",
171
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/CroCo_Encoder_512_MASt3R.pth",
172
+ patch_embed_cls="ManyAR_PatchEmbed",
173
+ img_size=(512, 512),
174
+ )
175
+ elif "dinov2" in encoder_str:
176
+ size = encoder_str.split("_")[1]
177
+ size_single_cap_letter = size[0].upper()
178
+ if "reg" in encoder_str:
179
+ with_registers = True
180
+ pretrained_checkpoint_path = None
181
+ elif "dav2" in encoder_str:
182
+ with_registers = False
183
+ pretrained_checkpoint_path = (
184
+ f"{relative_checkpoint_path}/DINOv2_ViT{size_single_cap_letter}_DepthAnythingV2.pth"
185
+ )
186
+ else:
187
+ with_registers = False
188
+ pretrained_checkpoint_path = None
189
+ return DINOv2Encoder(
190
+ name=encoder_str.replace("_reg", ""),
191
+ size=size,
192
+ with_registers=with_registers,
193
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
194
+ )
195
+ elif "naradio" in encoder_str:
196
+ return NARADIOEncoder(
197
+ name=encoder_str,
198
+ model_version=encoder_str.replace("na", ""),
199
+ )
200
+ elif "radio" in encoder_str:
201
+ if "e-radio" in encoder_str:
202
+ eradio_input_shape = (224, 224)
203
+ else:
204
+ eradio_input_shape = None
205
+ return RADIOEncoder(
206
+ name=encoder_str,
207
+ model_version=encoder_str,
208
+ eradio_input_shape=eradio_input_shape,
209
+ )
210
+ elif "cosmos" in encoder_str:
211
+ patch_size = int(encoder_str.split("x")[-1])
212
+ return CosmosEncoder(
213
+ name=encoder_str,
214
+ patch_size=patch_size,
215
+ pretrained_checkpoint_path=f"{relative_checkpoint_path}/Cosmos-Tokenizer-CI{patch_size}x{patch_size}/encoder.pth",
216
+ )
217
+ elif "patch_embedder" in encoder_str:
218
+ return PatchEmbedder(
219
+ name=encoder_str,
220
+ )
221
+ else:
222
+ raise ValueError(f"Unknown encoder: {encoder_str}")
223
+
224
+
225
+ __all__ = [
226
+ "encoder_factory",
227
+ "get_available_encoders",
228
+ "print_available_encoder_models",
229
+ "_make_encoder_test",
230
+ "UniCeptionEncoderBase",
231
+ "UniCeptionViTEncoderBase",
232
+ "EncoderInput",
233
+ "ViTEncoderInput",
234
+ "ViTEncoderOutput",
235
+ ]
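A hedged usage sketch for the factory above; the keyword arguments mirror the DINOv2 construction in `_make_encoder_test` and will differ for other encoders (building the encoder may download pretrained weights):

```python
from uniception.models.encoders import encoder_factory, print_available_encoder_models

print_available_encoder_models()

# Extra kwargs are forwarded to the selected encoder's constructor (here: DINOv2Encoder).
encoder = encoder_factory("dinov2", name="dinov2_large", size="large", with_registers=False)
print(type(encoder).__name__)  # DINOv2Encoder
```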
UniCeption/uniception/models/encoders/base.py ADDED
@@ -0,0 +1,157 @@
1
+ """
2
+ Base Encoder Class for UniCeption
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+
8
+ import torch.nn as nn
9
+ from jaxtyping import Float
10
+ from torch import Tensor
11
+ from torch.utils.checkpoint import checkpoint
12
+
13
+
14
+ @dataclass
15
+ class EncoderInput:
16
+ "Data class for Encoder Input"
17
+
18
+ data_norm_type: str
19
+ # Add other fields that are required by the specific implementation of the encoder.
20
+
21
+
22
+ @dataclass
23
+ class EncoderOutput:
24
+ "Data class for Encoder Output"
25
+
26
+ pass
27
+
28
+
29
+ @dataclass
30
+ class EncoderGlobalRepInput:
31
+ "Data class for Encoder Global Representation Input"
32
+
33
+ data: Float[Tensor, "batch channel"]
34
+
35
+
36
+ @dataclass
37
+ class EncoderGlobalRepOutput:
38
+ "Data class for Encoder Global Representation Output"
39
+
40
+ features: Float[Tensor, "batch enc_embed_dim"]
41
+
42
+
43
+ class UniCeptionEncoderBase(nn.Module):
44
+ "Encoder Base Class for UniCeption"
45
+
46
+ def __init__(
47
+ self,
48
+ name: str,
49
+ data_norm_type: str,
50
+ size: Optional[str] = None,
51
+ *args,
52
+ **kwargs,
53
+ ):
54
+ """
55
+ Base class for all encoders in UniCeption.
56
+ """
57
+ super().__init__(*args, **kwargs)
58
+
59
+ self.name: str = name
60
+ self.size: Optional[str] = size
61
+
62
+ self.data_norm_type: str = data_norm_type
63
+
64
+ def forward(
65
+ self,
66
+ encoder_input: EncoderInput,
67
+ ) -> EncoderOutput:
68
+ """
69
+ Forward interface for the UniCeption encoders.
70
+
71
+ We expect the "data_norm_type" field to be present in the encoder_input to check for normalization type.
72
+
73
+ Args:
74
+ encoder_input (EncoderInput): Input to the encoder. We expect the following fields: "data_norm_type: str".
75
+ This also includes the other fields that are required by the specific implementation of the encoder.
76
+
77
+ Returns:
78
+ EncoderOutput: Output of the encoder.
79
+ """
80
+
81
+ raise NotImplementedError
82
+
83
+ def _check_data_normalization_type(self, data_norm_type: str):
84
+ """
85
+ Check if the input normalization type matches the encoder's expected input data normalization type.
86
+
87
+ Args:
88
+ data_norm_type (str): Data normalization type.
89
+
90
+ Raises:
91
+ AssertionError: If the data normalization type does not match the encoder's expected input data normalization type.
92
+ """
93
+
94
+ assert (
95
+ data_norm_type == self.data_norm_type
96
+ ), f"Input normalization type {data_norm_type} does not match the encoder's normalization type {self.data_norm_type}."
97
+
98
+
99
+ @dataclass
100
+ class ViTEncoderInput(EncoderInput):
101
+ "Data class for Vision Transformer Encoder Input"
102
+
103
+ image: Float[Tensor, "batch channel height width"]
104
+
105
+
106
+ @dataclass
107
+ class ViTEncoderNonImageInput:
108
+ "Data class for Vision (2D-Grid) Transformer Encoder Non-Image Input"
109
+
110
+ data: Float[Tensor, "batch channel height width"]
111
+
112
+
113
+ @dataclass
114
+ class ViTEncoderOutput(EncoderOutput):
115
+ "Data class for Vision Transformer Encoder Output"
116
+
117
+ features: Float[Tensor, "batch enc_embed_dim feat_height feat_width"]
118
+
119
+
120
+ class UniCeptionViTEncoderBase(UniCeptionEncoderBase):
121
+ "Vision Transformer Encoder Base Class for UniCeption"
122
+
123
+ def __init__(
124
+ self,
125
+ patch_size: int,
126
+ gradient_checkpointing: bool = False,
127
+ *args,
128
+ **kwargs,
129
+ ):
130
+ """
131
+ Base class for all Vision Transformer encoders in UniCeption.
132
+ """
133
+ super().__init__(*args, **kwargs)
134
+
135
+ self.patch_size = patch_size
136
+ self.gradient_checkpointing = gradient_checkpointing
137
+
138
+ def wrap_module_with_gradient_checkpointing(self, module: nn.Module):
139
+ """
140
+ Wrapper for Gradient Checkpointing
141
+ References: https://github.com/microsoft/MoGe
142
+ """
143
+
144
+ class _CheckpointingWrapper(module.__class__):
145
+ _restore_cls = module.__class__
146
+
147
+ def forward(self, *args, **kwargs):
148
+ return checkpoint(super().forward, *args, use_reentrant=False, **kwargs)
149
+
150
+ module.__class__ = _CheckpointingWrapper
151
+ return module
152
+
153
+
154
+ if __name__ == "__main__":
155
+ dummy_model = UniCeptionEncoderBase(name="name", data_norm_type="norm")
156
+ dummy_vit_model = UniCeptionViTEncoderBase(name="name", data_norm_type="norm", patch_size=16)
157
+ print("Dummy Base Encoders created successfully!")
UniCeption/uniception/models/encoders/cosmos.py ADDED
@@ -0,0 +1,137 @@
1
+ """
2
+ Encoder Class for Cosmos
3
+ """
4
+
5
+ import torch
6
+
7
+ from uniception.models.encoders.base import UniCeptionViTEncoderBase, ViTEncoderInput, ViTEncoderOutput
8
+ from uniception.models.libs.cosmos_tokenizer.modules import ContinuousFormulation, EncoderType
9
+ from uniception.models.libs.cosmos_tokenizer.networks import TokenizerConfigs
10
+
11
+
12
+ class CosmosEncoder(UniCeptionViTEncoderBase):
13
+ "Uniception Cosmos Encoder"
14
+
15
+ def __init__(
16
+ self,
17
+ name: str,
18
+ data_norm_type: str = "cosmos",
19
+ patch_size: int = 8,
20
+ pretrained_checkpoint_path: str = None,
21
+ *args,
22
+ **kwargs,
23
+ ):
24
+ """
25
+ Cosmos Encoder for extracting spatial features from images.
26
+
27
+ Args:
28
+ name (str): Name of the encoder.
29
+ data_norm_type (str): Image normalization type. Default: "cosmos"
30
+ patch_size (int): Patch size for the encoder. Default: 8
31
+ pretrained_checkpoint_path (str): Path to the pretrained checkpoint. Default: None
32
+ """
33
+ # Init the base class
34
+ super().__init__(name=name, data_norm_type=data_norm_type, patch_size=patch_size, *args, **kwargs)
35
+
36
+ # Init Cosmos Encoder specific attributes
37
+ tokenizer_config = TokenizerConfigs["CI"].value.copy()
38
+ tokenizer_config.update(dict(spatial_compression=self.patch_size))
39
+
40
+ z_factor = tokenizer_config["z_factor"]
41
+ z_channels = tokenizer_config["z_channels"]
42
+ latent_channels = tokenizer_config["latent_channels"]
43
+ encoder_name = kwargs.get("encoder", EncoderType.Default.name)
44
+ print(tokenizer_config)
45
+ del tokenizer_config["z_factor"]
46
+ del tokenizer_config["z_channels"]
47
+ del tokenizer_config["latent_channels"]
48
+ self.encoder = EncoderType[encoder_name].value(z_channels=z_factor * z_channels, **tokenizer_config)
49
+ self.quant_conv = torch.nn.Conv2d(z_factor * z_channels, z_factor * latent_channels, 1)
50
+ formulation_name = kwargs.get("formulation", ContinuousFormulation.AE.name)
51
+ self.distribution = ContinuousFormulation[formulation_name].value()
52
+
53
+ # Load the pretrained checkpoint
54
+ if pretrained_checkpoint_path is not None:
55
+ print(f"Loading custom pretrained Cosmos checkpoint from {pretrained_checkpoint_path}")
56
+ ckpt = torch.load(pretrained_checkpoint_path, weights_only=False)
57
+ print(self.load_state_dict(ckpt["model"]))
58
+
59
+ def encode(self, input_tensor: torch.Tensor) -> tuple[torch.Tensor]:
60
+ """Encodes an image into a latent embedding or code.
61
+
62
+ Args:
63
+ input_tensor: The input tensor Bx3xHxW layout, range [-1..1].
64
+ Returns:
65
+ For continuous image (CI) tokenizer, the tuple contains:
66
+ - The latent embedding, Bx16x(h)x(w), where the compression
67
+ rate is (H/h x W/w), and channel dimension of 16.
68
+ For discrete image (DI) tokenizer, the tuple contains:
69
+ - The indices, Bx(h)x(w), from a codebook of size 64K, which
70
+ corresponds to FSQ levels of (8,8,8,5,5,5).
71
+ - The discrete code, Bx6x(h)x(w), where the compression rate is
72
+ again (H/h x W/w), and channel dimension of 6.
73
+ """
74
+ x = self.encoder(input_tensor)
75
+ x = self.quant_conv(x)
76
+ output_latent = self.distribution(x)
77
+
78
+ if isinstance(output_latent, torch.Tensor):
79
+ return output_latent
80
+ return output_latent[:-1]
81
+
82
+ def forward(self, encoder_input: ViTEncoderInput) -> ViTEncoderOutput:
83
+ """
84
+ Cosmos Encoder Forward Pass
85
+
86
+ Args:
87
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
88
+
89
+ Returns:
90
+ ViTEncoderOutput: Output data from the encoder.
91
+ """
92
+ # Check image normalization type
93
+ self._check_data_normalization_type(encoder_input.data_norm_type)
94
+
95
+ # Check the dtype and shape of the input image
96
+ assert isinstance(encoder_input.image, torch.Tensor), "Input must be a torch.Tensor"
97
+ assert encoder_input.image.ndim == 4, "Input must be of shape (B, C, H, W)"
98
+ batch_size, channels, height, width = encoder_input.image.shape
99
+ assert channels == 3, "Input must have 3 channels"
100
+ assert (
101
+ height % self.patch_size == 0 and width % self.patch_size == 0
102
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
103
+
104
+ # Extract the features using the Cosmos encoder
105
+ features = self.encode(encoder_input.image)[0].contiguous()
106
+
107
+ return ViTEncoderOutput(features=features)
108
+
109
+
110
+ if __name__ == "__main__":
111
+
112
+ # initialize different variants of the Cosmos Encoder, untrained
113
+ for is_continuous in [True]:
114
+ for patch_size in [8, 16]:
115
+ encoder = CosmosEncoder(name="cosmos", patch_size=patch_size)
116
+
117
+ # # initialize from trained checkpoint, with/without jit inference capability
118
+ PRETRAINED_JIT_CHECKPOINTS = {
119
+ ("CI", 8): "../../../checkpoints/encoders/cosmos/Cosmos-Tokenizer-CI8x8/encoder.pth",
120
+ ("CI", 16): "../../../checkpoints/encoders/cosmos/Cosmos-Tokenizer-CI16x16/encoder.pth",
121
+ }
122
+
123
+ for patch_size in [8, 16]:
124
+
125
+ encoder = CosmosEncoder(
126
+ name="cosmos",
127
+ patch_size=patch_size,
128
+ pretrained_checkpoint_path=PRETRAINED_JIT_CHECKPOINTS[("CI", patch_size)],
129
+ )
130
+
131
+ # example inference
132
+ dummy_image = torch.randn(1, 3, 256, 256).cuda()
133
+
134
+ encoder_input = ViTEncoderInput(data_norm_type="cosmos", image=dummy_image)
135
+
136
+ encoder = encoder.cuda()
137
+ encoder_output = encoder(encoder_input)
UniCeption/uniception/models/encoders/croco.py ADDED
@@ -0,0 +1,457 @@
1
+ """
2
+ Encoder Class for CroCo & DUSt3R
3
+ """
4
+
5
+ from functools import partial
6
+ from typing import Callable, List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from uniception.models.encoders.base import UniCeptionViTEncoderBase, ViTEncoderInput, ViTEncoderOutput
12
+ from uniception.models.libs.croco.blocks import Block
13
+ from uniception.models.libs.croco.patch_embed import get_patch_embed
14
+ from uniception.models.libs.croco.pos_embed import RoPE2D
15
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner, feature_take_indices
16
+
17
+
18
+ class CroCoEncoder(UniCeptionViTEncoderBase):
19
+ "UniCeption CroCov2 Encoder"
20
+
21
+ def __init__(
22
+ self,
23
+ name: str,
24
+ data_norm_type: str,
25
+ patch_embed_cls: str = "PatchEmbedDust3R",
26
+ img_size: Union[int, Tuple[int, int]] = (224, 224),
27
+ patch_size: int = 16,
28
+ enc_embed_dim: int = 1024,
29
+ enc_depth: int = 24,
30
+ enc_num_heads: int = 16,
31
+ mlp_ratio: int = 4,
32
+ norm_layer: Callable = partial(nn.LayerNorm, eps=1e-6),
33
+ pos_embed: str = "RoPE100",
34
+ pretrained_checkpoint_path: str = None,
35
+ override_checkpoint_attributes: bool = False,
36
+ *args,
37
+ **kwargs,
38
+ ):
39
+ """
40
+ References: https://github.com/naver/dust3r, https://github.com/naver/croco
41
+
42
+ Args:
43
+ name (str): Name of the encoder.
44
+ data_norm_type (str): Input data normalization type.
45
+ patch_embed_cls (str, optional): The class to use for patch embedding.
46
+ Defaults to 'PatchEmbedDust3R'. Options: ['PatchEmbedCroCo', 'PatchEmbedDust3R', 'ManyAR_PatchEmbed'].
47
+ img_size (Union[int, Tuple[int, int]], optional): The size of the input image. Defaults to (224, 224).
48
+ patch_size (int, optional): The size of the patches to divide the image into. Defaults to 16.
49
+ enc_embed_dim (int, optional): The dimension of the encoder's embedding. Defaults to 1024.
50
+ enc_depth (int, optional): The number of encoder layers/transformer blocks. Defaults to 24.
51
+ enc_num_heads (int, optional): The number of encoder heads. Defaults to 16.
52
+ mlp_ratio (int, optional): The MLP ratio used for the CroCo encoder transformer. Defaults to 4.
53
+ norm_layer (nn.Module, optional): The normalization layer to use in the transformer. Defaults to nn.LayerNorm with eps=1e-6.
54
+ pos_embed (str, optional): Positional embedding. Defaults to 'RoPE100'. Format: 'RoPE<freq>', e.g., 'RoPE100'.
55
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. Defaults to None.
56
+ """
57
+ # Init the base class
58
+ super().__init__(
59
+ name=name,
60
+ data_norm_type=data_norm_type,
61
+ patch_size=patch_size,
62
+ *args,
63
+ **kwargs,
64
+ )
65
+
66
+ # Init the CroCo Encoder specific attributes
67
+ self.patch_embed_cls = patch_embed_cls
68
+ self.img_size = img_size
69
+ self.enc_embed_dim = enc_embed_dim
70
+ self.enc_depth = enc_depth
71
+ self.enc_num_heads = enc_num_heads
72
+ self.mlp_ratio = mlp_ratio
73
+ self.norm_layer = norm_layer
74
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
75
+ self.override_checkpoint_attributes = override_checkpoint_attributes
76
+
77
+ # Init the positional embedding
78
+ self.pos_embed = pos_embed
79
+ if pos_embed.startswith("RoPE"): # eg RoPE100
80
+ self.enc_pos_embed = None # nothing to add in the encoder with RoPE
81
+ self.dec_pos_embed = None # nothing to add in the decoder with RoPE
82
+ if RoPE2D is None:
83
+ raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
84
+ freq = float(pos_embed[len("RoPE") :])
85
+ self.rope = RoPE2D(freq=freq)
86
+ else:
87
+ raise NotImplementedError("Unknown pos_embed " + pos_embed)
88
+
89
+ # Init the patch embedding
90
+ self._set_patch_embed(img_size, patch_size, enc_embed_dim)
91
+
92
+ # Init the encoder
93
+ self._set_encoder(enc_depth, enc_embed_dim, enc_num_heads, mlp_ratio, norm_layer, self.rope)
94
+
95
+ # Initialize random weights
96
+ self.initialize_weights()
97
+
98
+ # Load the pretrained CroCo checkpoint if provided
99
+ if pretrained_checkpoint_path:
100
+ print(f"Loading pretrained CroCo checkpoint from {pretrained_checkpoint_path}")
101
+ ckpt = torch.load(pretrained_checkpoint_path, weights_only=False)
102
+ print(self.load_state_dict(ckpt["model"]))
103
+ if not override_checkpoint_attributes:
104
+ ckpt_data_norm_type = ckpt["data_norm_type"]
105
+ ckpt_patch_embed_cls = ckpt["patch_embed_cls"]
106
+ assert (
107
+ data_norm_type == ckpt_data_norm_type
108
+ ), f"Data normalization type {data_norm_type} does not match the checkpoint {ckpt_data_norm_type}."
109
+ assert (
110
+ patch_embed_cls == ckpt_patch_embed_cls
111
+ ), f"Patch embedding class {patch_embed_cls} does not match the checkpoint {ckpt_patch_embed_cls}."
112
+ else:
113
+ print("No pretrained checkpoint provided. Randomly initializing the CroCo encoder.")
114
+
115
+ def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
116
+ "Set the patch embedding scheme"
117
+ self.patch_embed = get_patch_embed(self.patch_embed_cls, img_size, patch_size, enc_embed_dim)
118
+
119
+ def _set_encoder(self, enc_depth, enc_embed_dim, enc_num_heads, mlp_ratio, norm_layer, rope):
120
+ "Set the encoder"
121
+ self.enc_blocks = nn.ModuleList(
122
+ [
123
+ Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=rope)
124
+ for _ in range(enc_depth)
125
+ ]
126
+ )
127
+ self.enc_norm = norm_layer(enc_embed_dim)
128
+
129
+ def initialize_weights(self):
130
+ "Initialize the weights of the patch embedding and the transformer encoder"
131
+ # Patch embedding
132
+ self.patch_embed._init_weights()
133
+ # Linears and layer norms
134
+ self.apply(self._init_weights)
135
+
136
+ def _init_weights(self, m):
137
+ "Initialize the transformer encoder weights"
138
+ if isinstance(m, nn.Linear):
139
+ # We use xavier_uniform following official JAX ViT:
140
+ torch.nn.init.xavier_uniform_(m.weight)
141
+ if isinstance(m, nn.Linear) and m.bias is not None:
142
+ nn.init.constant_(m.bias, 0)
143
+ elif isinstance(m, nn.LayerNorm):
144
+ nn.init.constant_(m.bias, 0)
145
+ nn.init.constant_(m.weight, 1.0)
146
+
147
+ def forward(self, encoder_input: ViTEncoderInput) -> ViTEncoderOutput:
148
+ """
149
+ CroCov2 Encoder Forward Pass
150
+
151
+ Args:
152
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
153
+
154
+ Returns:
155
+ ViTEncoderOutput: Output data from the encoder.
156
+ """
157
+ # Check image normalization type
158
+ self._check_data_normalization_type(encoder_input.data_norm_type)
159
+
160
+ # Get the true shape of the image for landscape/portrait mode check in patch embedding
161
+ batch_size, _, height, width = encoder_input.image.shape
162
+ if hasattr(encoder_input, "true_shape"):
163
+ true_shape = encoder_input.true_shape
164
+ else:
165
+ true_shape = torch.tensor([height, width])[None].repeat(batch_size, 1)
166
+
167
+ # Embed the image into patches
168
+ features, pos = self.patch_embed(encoder_input.image, true_shape=true_shape)
169
+
170
+ # Now apply the transformer encoder and normalization
171
+ for blk in self.enc_blocks:
172
+ features = blk(features, pos)
173
+ features = self.enc_norm(features)
174
+
175
+ # Resize the features to the expected shape
176
+ # (B x Num_patches x Embed_dim) -> (B x Embed_dim x H / Patch_Size x W / Patch_Size)
177
+ features = features.permute(0, 2, 1)
178
+ features = features.reshape(
179
+ -1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size
180
+ ).contiguous()
181
+
182
+ return ViTEncoderOutput(features=features)
183
+
184
+
185
+ class CroCoIntermediateFeatureReturner(CroCoEncoder, IntermediateFeatureReturner):
186
+ "Intermediate Feature Returner for UniCeption CroCo Encoder"
187
+
188
+ def __init__(
189
+ self,
190
+ name: str,
191
+ data_norm_type: str,
192
+ patch_embed_cls: str = "PatchEmbedDust3R",
193
+ img_size: Union[int, Tuple[int, int]] = (224, 224),
194
+ patch_size: int = 16,
195
+ enc_embed_dim: int = 1024,
196
+ enc_depth: int = 24,
197
+ enc_num_heads: int = 16,
198
+ mlp_ratio: int = 4,
199
+ norm_layer: Callable = partial(nn.LayerNorm, eps=1e-6),
200
+ pos_embed: str = "RoPE100",
201
+ pretrained_checkpoint_path: str = None,
202
+ indices: Optional[Union[int, List[int]]] = None,
203
+ norm_intermediate: bool = True,
204
+ stop_early: bool = False,
205
+ intermediates_only: bool = True,
206
+ *args,
207
+ **kwargs,
208
+ ):
209
+ """
210
+ Intermediate Feature Returner for the CroCo Encoder.
211
+
212
+ Args:
213
+ name (str): Name of the encoder.
214
+ data_norm_type (str): Input data normalization type.
215
+ patch_embed_cls (str, optional): The class to use for patch embedding.
216
+ Defaults to 'PatchEmbedDust3R'. Options: ['PatchEmbedCroCo', 'PatchEmbedDust3R', 'ManyAR_PatchEmbed'].
217
+ img_size (Union[int, Tuple[int, int]], optional): The size of the input image. Defaults to (224, 224).
218
+ patch_size (int, optional): The size of the patches to divide the image into. Defaults to 16.
219
+ enc_embed_dim (int, optional): The dimension of the encoder's embedding. Defaults to 1024.
220
+ enc_depth (int, optional): The number of encoder layers/transformer blocks. Defaults to 24.
221
+ enc_num_heads (int, optional): The number of encoder heads. Defaults to 16.
222
+ mlp_ratio (int, optional): The MLP ratio used for the CroCo encoder transformer. Defaults to 4.
223
+ norm_layer (nn.Module, optional): The normalization layer to use in the transformer. Defaults to nn.LayerNorm with eps=1e-6.
224
+ pos_embed (str, optional): Positional embedding. Defaults to 'RoPE100'. Format: 'RoPE<freq>', e.g., 'RoPE100'.
225
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. Defaults to None.
226
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. Defaults to None. Options:
227
+ - None: Return all intermediate layers.
228
+ - int: Return the last n layers.
229
+ - List[int]: Return the intermediate layers at the specified indices.
230
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. Defaults to True.
231
+ stop_early (bool, optional): Whether to stop early. Defaults to False.
232
+ intermediates_only (bool, optional): Whether to return only the intermediate features. Defaults to True.
233
+ """
234
+ # Init the base classes
235
+ CroCoEncoder.__init__(
236
+ self,
237
+ name=name,
238
+ data_norm_type=data_norm_type,
239
+ patch_embed_cls=patch_embed_cls,
240
+ img_size=img_size,
241
+ patch_size=patch_size,
242
+ enc_embed_dim=enc_embed_dim,
243
+ enc_depth=enc_depth,
244
+ enc_num_heads=enc_num_heads,
245
+ mlp_ratio=mlp_ratio,
246
+ norm_layer=norm_layer,
247
+ pos_embed=pos_embed,
248
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
249
+ *args,
250
+ **kwargs,
251
+ )
252
+ IntermediateFeatureReturner.__init__(
253
+ self,
254
+ indices=indices,
255
+ norm_intermediate=norm_intermediate,
256
+ stop_early=stop_early,
257
+ intermediates_only=intermediates_only,
258
+ )
259
+
260
+ def forward(
261
+ self, encoder_input: ViTEncoderInput
262
+ ) -> Union[List[ViTEncoderOutput], Tuple[ViTEncoderOutput, List[ViTEncoderOutput]]]:
263
+ """
264
+ CroCov2 Encoder Forward Pass with Intermediate Feature Return
265
+
266
+ Args:
267
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
268
+
269
+ Returns:
270
+ Union[List[ViTEncoderOutput], Tuple[ViTEncoderOutput, List[ViTEncoderOutput]]]: Output data from the encoder.
271
+ If `intermediates_only` is True, returns a list of intermediate features.
272
+ Otherwise, returns a tuple with the final features and a list of intermediate features.
273
+ """
274
+ # Check image normalization type
275
+ self._check_data_normalization_type(encoder_input.data_norm_type)
276
+
277
+ # Get the true shape of the image for landscape/portrait mode check in patch embedding
278
+ batch_size, _, height, width = encoder_input.image.shape
279
+ if hasattr(encoder_input, "true_shape"):
280
+ true_shape = encoder_input.true_shape
281
+ else:
282
+ true_shape = torch.tensor([height, width])[None].repeat(batch_size, 1)
283
+
284
+ # Embed the image into patches
285
+ features, pos = self.patch_embed(encoder_input.image, true_shape=true_shape)
286
+
287
+ # Get indices of the intermediate features to return
288
+ intermediate_features = []
289
+ take_indices, max_index = feature_take_indices(len(self.enc_blocks), self.indices)
290
+
291
+ # Get the blocks based on early stopping
292
+ if torch.jit.is_scripting() or not self.stop_early: # can't slice blocks in torchscript
293
+ blocks = self.enc_blocks
294
+ else:
295
+ blocks = self.enc_blocks[: max_index + 1]
296
+
297
+ # Now apply the transformer encoder and normalization
298
+ for blk_idx, blk in enumerate(blocks):
299
+ features = blk(features, pos)
300
+ if blk_idx in take_indices:
301
+ # Normalize intermediates with final norm layer if enabled
302
+ intermediate_features.append(self.enc_norm(features) if self.norm_intermediate else features)
303
+
304
+ # Reshape the intermediate features and convert to ViTEncoderOutput class
305
+ intermediate_features = [
306
+ intermediate.permute(0, 2, 1)
307
+ .reshape(-1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size)
308
+ .contiguous()
309
+ for intermediate in intermediate_features
310
+ ]
311
+ intermediate_features = [ViTEncoderOutput(features=intermediate) for intermediate in intermediate_features]
312
+
313
+ # Return only the intermediate features if enabled
314
+ if self.intermediates_only:
315
+ return intermediate_features
316
+
317
+ # Normalize and reshape the final features
318
+ features = self.enc_norm(features)
319
+ # Resize the features to the expected shape
320
+ # (B x Num_patches x Embed_dim) -> (B x Embed_dim x H / Patch_Size x W / Patch_Size)
321
+ features = features.permute(0, 2, 1)
322
+ features = features.reshape(
323
+ -1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size
324
+ ).contiguous()
325
+ final_features = ViTEncoderOutput(features=features)
326
+
327
+ return final_features, intermediate_features
328
+
329
+
330
+ if __name__ == "__main__":
331
+ # Init the pre-trained CroCo Encoder
332
+ pretrained_checkpoint_path = "../../../checkpoints/encoders/CroCo_Encoder_224.pth"
333
+ croco_encoder = CroCoEncoder(
334
+ name="croco",
335
+ data_norm_type="croco",
336
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
337
+ patch_embed_cls="PatchEmbedCroCo",
338
+ )
339
+
340
+ # Init the pre-trained DUSt3R CroCo Encoder
341
+ pretrained_checkpoint_path = "../../../checkpoints/encoders/CroCo_Encoder_224_DUSt3R_linear.pth"
342
+ dust3r_encoder = CroCoEncoder(
343
+ name="dust3r_224",
344
+ data_norm_type="dust3r",
345
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
346
+ patch_embed_cls="PatchEmbedDust3R",
347
+ )
348
+
349
+ # Init the pre-trained DUSt3R 512 linear CroCo Encoder
350
+ pretrained_checkpoint_path = "../../../checkpoints/encoders/CroCo_Encoder_512_DUSt3R_linear.pth"
351
+ dust3r_encoder_512 = CroCoEncoder(
352
+ name="dust3r_512",
353
+ data_norm_type="dust3r",
354
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
355
+ patch_embed_cls="ManyAR_PatchEmbed",
356
+ img_size=(512, 512),
357
+ )
358
+
359
+ # Init the pre-trained DUSt3R 512 DPT CroCo Encoder
360
+ pretrained_checkpoint_path = "../../../checkpoints/encoders/CroCo_Encoder_512_DUSt3R_dpt.pth"
361
+ dust3r_encoder_512_dpt = CroCoEncoder(
362
+ name="dust3r_512_dpt",
363
+ data_norm_type="dust3r",
364
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
365
+ patch_embed_cls="ManyAR_PatchEmbed",
366
+ img_size=(512, 512),
367
+ )
368
+
369
+ # Init the MASt3R 512 CroCo Encoder
370
+ pretrained_checkpoint_path = "../../../checkpoints/encoders/CroCo_Encoder_512_MASt3R.pth"
371
+ mast3r_encoder_512 = CroCoEncoder(
372
+ name="mast3r_512",
373
+ data_norm_type="dust3r",
374
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
375
+ patch_embed_cls="ManyAR_PatchEmbed",
376
+ img_size=(512, 512),
377
+ )
378
+
379
+ print("All CroCo & DUSt3R Encoders have been initialized successfully!")
380
+
381
+ # Intermediate Feature Returner Tests
382
+ print("Running Intermediate Feature Returner Tests...")
383
+ pretrained_checkpoint_path = "../../../checkpoints/encoders/CroCo_Encoder_512_DUSt3R_dpt.pth"
384
+
385
+ # Run the intermediate feature returner with last-n index
386
+ dust3r_intermediate_feature_returner = CroCoIntermediateFeatureReturner(
387
+ name="dust3r_512_dpt",
388
+ data_norm_type="dust3r",
389
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
390
+ patch_embed_cls="ManyAR_PatchEmbed",
391
+ img_size=(512, 512),
392
+ indices=6, # Last 6 layers
393
+ )
394
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="dust3r")
395
+ output = dust3r_intermediate_feature_returner(dummy_input)
396
+ assert isinstance(output, list), "Output must be a list of intermediate features"
397
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
398
+ assert len(output) == 6, "Output must have length of intermediate features equal to the number of indices"
399
+
400
+ # Run the intermediate feature returner with specific indices
401
+ dust3r_intermediate_feature_returner = CroCoIntermediateFeatureReturner(
402
+ name="dust3r_512_dpt",
403
+ data_norm_type="dust3r",
404
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
405
+ patch_embed_cls="ManyAR_PatchEmbed",
406
+ img_size=(512, 512),
407
+ indices=[0, 2, 4, 6], # Specific layers
408
+ )
409
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="dust3r")
410
+ output = dust3r_intermediate_feature_returner(dummy_input)
411
+ assert isinstance(output, list), "Output must be a list of intermediate features"
412
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
413
+ assert len(output) == 4, "Output must have length of intermediate features equal to the number of indices"
414
+
415
+ # Test the normalizing of intermediate features
416
+ dust3r_intermediate_feature_returner = CroCoIntermediateFeatureReturner(
417
+ name="dust3r_512_dpt",
418
+ data_norm_type="dust3r",
419
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
420
+ patch_embed_cls="ManyAR_PatchEmbed",
421
+ img_size=(512, 512),
422
+ indices=[-1],
423
+ norm_intermediate=False,
424
+ intermediates_only=False,
425
+ )
426
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="dust3r")
427
+ output = dust3r_intermediate_feature_returner(dummy_input)
428
+ assert isinstance(output, tuple), "Output must be a tuple with final features and intermediate features"
429
+ assert isinstance(output[0], ViTEncoderOutput), "First element of output must be the final features"
430
+ assert isinstance(output[1], list), "Second element of output must be a list of intermediate features"
431
+ assert isinstance(output[1][0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
432
+ if not isinstance(dust3r_intermediate_feature_returner.enc_norm, torch.nn.Identity):
433
+ assert not torch.equal(
434
+ output[0].features, output[1][0].features
435
+ ), "Final features and intermediate features must be different"
436
+
437
+ dust3r_intermediate_feature_returner = CroCoIntermediateFeatureReturner(
438
+ name="dust3r_512_dpt",
439
+ data_norm_type="dust3r",
440
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
441
+ patch_embed_cls="ManyAR_PatchEmbed",
442
+ img_size=(512, 512),
443
+ indices=[-1],
444
+ norm_intermediate=True,
445
+ intermediates_only=False,
446
+ )
447
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="dust3r")
448
+ output = dust3r_intermediate_feature_returner(dummy_input)
449
+ assert isinstance(output, tuple), "Output must be a tuple with final features and intermediate features"
450
+ assert isinstance(output[0], ViTEncoderOutput), "First element of output must be the final features"
451
+ assert isinstance(output[1], list), "Second element of output must be a list of intermediate features"
452
+ assert isinstance(output[1][0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
453
+ assert torch.equal(
454
+ output[0].features, output[1][0].features
455
+ ), "Final features and intermediate features must be same"
456
+
457
+ print("All Intermediate Feature Returner Tests have passed successfully!")
UniCeption/uniception/models/encoders/dense_rep_encoder.py ADDED
@@ -0,0 +1,344 @@
1
+ """
2
+ Encoder class for Dense Representation Encoder
3
+ """
4
+
5
+ import math
6
+ from functools import partial
7
+ from typing import Callable, List, Optional, Tuple, Type, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.nn.init import trunc_normal_
13
+
14
+ from uniception.models.encoders.base import (
15
+ UniCeptionViTEncoderBase,
16
+ ViTEncoderInput,
17
+ ViTEncoderNonImageInput,
18
+ ViTEncoderOutput,
19
+ )
20
+
21
+
22
+ def make_2tuple(x):
23
+ if isinstance(x, tuple):
24
+ assert len(x) == 2
25
+ return x
26
+
27
+ assert isinstance(x, int)
28
+ return (x, x)
29
+
30
+
31
+ class ResidualBlock(nn.Module):
32
+ "Redidual block for Dense Representation Encoder"
33
+
34
+ def __init__(self, in_channels: int, out_channels: int, act_layer: Type[nn.Module] = nn.GELU):
35
+ super(ResidualBlock, self).__init__()
36
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
37
+ self.act = act_layer()
38
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
39
+ self.shortcut = (
40
+ nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
41
+ if in_channels != out_channels
42
+ else nn.Identity()
43
+ )
44
+
45
+ def forward(self, x):
46
+ identity = self.shortcut(x)
47
+ out = self.conv1(x)
48
+ out = self.act(out)
49
+ out = self.conv2(out)
50
+ out += identity
51
+
52
+ return self.act(out)
53
+
54
+
55
+ class DenseRepresentationEncoder(UniCeptionViTEncoderBase):
56
+ "UniCeption Dense Representation Encoder"
57
+
58
+ def __init__(
59
+ self,
60
+ name: str,
61
+ in_chans: int = 3,
62
+ enc_embed_dim: int = 1024,
63
+ apply_pe: bool = True,
64
+ input_size_for_pe: Union[int, Tuple[int, int]] = 518,
65
+ patch_size: int = 14,
66
+ intermediate_dims: List[int] = [588, 768, 1024],
67
+ data_norm_type: str = "dense_rep_encoder",
68
+ act_layer: Type[nn.Module] = nn.GELU,
69
+ norm_layer: Optional[Callable] = partial(nn.LayerNorm, eps=1e-6),
70
+ post_pe_norm_layer: Optional[Callable] = partial(nn.LayerNorm, eps=1e-6),
71
+ interpolate_antialias: bool = False,
72
+ interpolate_offset: float = 0.1,
73
+ pretrained_checkpoint_path: str = None,
74
+ *args,
75
+ **kwargs,
76
+ ):
77
+ """
78
+ Dense Representation Encoder for extracting patch-wise features from a spatial input of size (B, C, H, W).
79
+ Uses a convolution based patchify followed by some residual blocks.
80
+ Also applies positional encoding with interpolation to the patch-wise features if required.
81
+
82
+ Args:
83
+ in_chans (int): Number of input channels.
84
+ enc_embed_dim (int): Embedding dimension of the encoder.
85
+ apply_pe (bool): Whether to apply positional encoding.
86
+ input_size_for_pe (Union[int, Tuple[int, int]]): Input size for positional encoding.
87
+ patch_size (int): Patch size of the encoder.
88
+ intermediate_dims (List[int]): Intermediate dimensions of the encoder.
89
+ data_norm_type (str): Data normalization type. (Used for checking if the input images are normalized correctly.)
90
+ act_layer (Type[nn.Module]): Activation layer.
91
+ norm_layer (Optional[Callable]): Normalization layer.
92
+ post_pe_norm_layer (Optional[Callable]): Normalization layer after positional encoding.
93
+ interpolate_antialias (bool): Whether to apply antialiasing in interpolation.
94
+ interpolate_offset (float): Offset for interpolation.
95
+ pretrained_checkpoint_path (str): Path to pretrained checkpoint.
96
+ """
97
+ # Init the base class
98
+ super().__init__(
99
+ name=name,
100
+ data_norm_type=data_norm_type,
101
+ patch_size=patch_size,
102
+ *args,
103
+ **kwargs,
104
+ )
105
+
106
+ # Init the specific attributes
107
+ self.in_chans = in_chans
108
+ self.enc_embed_dim = enc_embed_dim
109
+ self.intermediate_dims = intermediate_dims
110
+ self.apply_pe = apply_pe
111
+
112
+ # Initialize the encoder with a pixel unshuffle and conv projection to patchify the input
113
+ self.unshuffle = nn.PixelUnshuffle(self.patch_size)
114
+ self.conv_in = nn.Conv2d(self.in_chans * (self.patch_size**2), self.intermediate_dims[0], 3, 1, 1)
115
+
116
+ # Add residual blocks
117
+ layers = []
118
+ for intermediate_idx in range(len(self.intermediate_dims) - 1):
119
+ layers.append(
120
+ ResidualBlock(
121
+ in_channels=self.intermediate_dims[intermediate_idx],
122
+ out_channels=self.intermediate_dims[intermediate_idx + 1],
123
+ act_layer=act_layer,
124
+ )
125
+ )
126
+
127
+ # Final projection to match encoder embeddings dim
128
+ layers.append(
129
+ nn.Conv2d(
130
+ in_channels=self.intermediate_dims[-1],
131
+ out_channels=self.enc_embed_dim,
132
+ kernel_size=1,
133
+ stride=1,
134
+ padding=0,
135
+ )
136
+ )
137
+ self.encoder = nn.Sequential(*layers)
138
+
139
+ # Init norm layer after encoder if required
140
+ self.norm_layer = norm_layer(enc_embed_dim) if norm_layer else nn.Identity()
141
+ if isinstance(self.norm_layer, nn.LayerNorm):
142
+ nn.init.constant_(self.norm_layer.bias, 0)
143
+ nn.init.constant_(self.norm_layer.weight, 1.0)
144
+
145
+ if self.apply_pe:
146
+ # Init the patch resolution details required for positional encoding
147
+ patch_HW = make_2tuple(patch_size)
148
+ self.input_size_for_pe = make_2tuple(input_size_for_pe)
149
+ self.patches_resolution = (
150
+ self.input_size_for_pe[0] // patch_HW[0],
151
+ self.input_size_for_pe[1] // patch_HW[1],
152
+ )
153
+ self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
154
+
155
+ # Init the sinusodial positional encodings
156
+ self.register_buffer(
157
+ "pos_embed",
158
+ self._get_sinusoid_encoding_table(self.num_patches, self.enc_embed_dim, 70007),
159
+ )
160
+ self.interpolate_antialias = interpolate_antialias
161
+ self.interpolate_offset = interpolate_offset
162
+
163
+ # Init the norm layer after positional encoding if required
164
+ self.post_pe_norm = post_pe_norm_layer(enc_embed_dim) if post_pe_norm_layer else nn.Identity()
165
+ if isinstance(self.post_pe_norm, nn.LayerNorm):
166
+ nn.init.constant_(self.post_pe_norm.bias, 0)
167
+ nn.init.constant_(self.post_pe_norm.weight, 1.0)
168
+
169
+ # Load the pretrained checkpoint if provided
170
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
171
+ if self.pretrained_checkpoint_path:
172
+ print(
173
+ f"Loading custom pretrained Dense Representation Encoder checkpoint from {self.pretrained_checkpoint_path} ..."
174
+ )
175
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
176
+ print(self.load_state_dict(ckpt["model"]))
177
+
178
+ def _get_sinusoid_encoding_table(self, n_position, d_hid, base):
179
+ "Sinusoid position encoding table"
180
+
181
+ def get_position_angle_vec(position):
182
+ return [position / np.power(base, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
183
+
184
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
185
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
186
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
187
+
188
+ return torch.FloatTensor(sinusoid_table)
189
+
190
+ def interpolate_pos_encoding(self, features, height, width):
191
+ """
192
+ Interpolate the positional encoding to the expected size.
193
+
194
+ Args:
195
+ features (torch.Tensor): Input tensor of shape (B, N, C).
196
+ height (int, float): Height of the input tensor.
197
+ width (int, float): Width of the input tensor.
198
+
199
+ Returns:
200
+ torch.Tensor: Interpolated positional encoding tensor of shape (1, N, C).
201
+ """
202
+ previous_dtype = features.dtype
203
+ npatch = features.shape[1]
204
+ N = self.pos_embed.unsqueeze(0).shape[1]
205
+ if npatch == N and height == width:
206
+ return self.pos_embed.unsqueeze(0)
207
+ patch_pos_embed = self.pos_embed.unsqueeze(0).float()
208
+ dim = features.shape[-1]
209
+ height0 = height // self.patch_size
210
+ width0 = width // self.patch_size
211
+ M = int(math.sqrt(N)) # Recover the number of patches in each dimension
212
+ assert N == M * M
213
+ kwargs = {}
214
+ if self.interpolate_offset:
215
+ # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
216
+ # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
217
+ sh = float(height0 + self.interpolate_offset) / M
218
+ sw = float(width0 + self.interpolate_offset) / M
219
+ kwargs["scale_factor"] = (sh, sw)
220
+ else:
221
+ # Simply specify an output size instead of a scale factor
222
+ kwargs["size"] = (height0, width0)
223
+ patch_pos_embed = nn.functional.interpolate(
224
+ patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
225
+ mode="bicubic",
226
+ antialias=self.interpolate_antialias,
227
+ **kwargs,
228
+ )
229
+ assert (height0, width0) == patch_pos_embed.shape[-2:]
230
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
231
+
232
+ return patch_pos_embed.to(previous_dtype)
233
+
234
+ def forward(self, encoder_input: Union[ViTEncoderInput, ViTEncoderNonImageInput]) -> ViTEncoderOutput:
235
+ """
236
+ Dense Representation Encoder Forward Pass
237
+
238
+ Args:
239
+ encoder_input (Union[ViTEncoderInput, ViTEncoderNonImageInput]): Input data for the encoder.
240
+ If input type is ViTEncoderInput, input data must contain image normalization type and normalized image tensor.
241
+ If input type is ViTEncoderNonImageInput, input data must contain a tensor of size (B, C, H, W).
242
+
243
+ Returns:
244
+ ViTEncoderOutput: Output data from the encoder.
245
+ """
246
+ # Get the input data and verify normalization if the input type is ViTEncoderInput
247
+ if isinstance(encoder_input, ViTEncoderInput):
248
+ self._check_data_normalization_type(encoder_input.data_norm_type)
249
+ input_data = encoder_input.image
250
+ elif isinstance(encoder_input, ViTEncoderNonImageInput):
251
+ input_data = encoder_input.data
252
+ else:
253
+ raise ValueError("Unsupported input type for Dense Representation Encoder.")
254
+
255
+ # Check the dtype and shape of the input
256
+ assert isinstance(input_data, torch.Tensor), "Input must be a torch.Tensor"
257
+ assert input_data.ndim == 4, "Input must be of shape (B, C, H, W)"
258
+ assert input_data.shape[1] == self.in_chans, f"Input channels must be {self.in_chans}"
259
+ batch_size, channels, height, width = input_data.shape
260
+ assert (
261
+ height % self.patch_size == 0 and width % self.patch_size == 0
262
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
263
+
264
+ # Encode the dense representation
265
+ features = self.unshuffle(input_data)
266
+ features = self.conv_in(features)
267
+ features = self.encoder(features)
268
+ features = features.flatten(2).transpose(
269
+ 1, 2
270
+ ) # (B, E, H / Patch_Size, W / Patch_Size) -> (B, H / Patch_Size * W / Patch_Size, E)
271
+ features = self.norm_layer(features) # Normalize the features after patch encoding
272
+
273
+ # Apply positional encoding if required
274
+ if self.apply_pe:
275
+ features = features + self.interpolate_pos_encoding(
276
+ features, height, width
277
+ ) # (B, H / Patch_Size * W / Patch_Size, E)
278
+ features = self.post_pe_norm(features) # Normalize the features after positional encoding
279
+
280
+ # Resize the features to the expected shape
281
+ # (B x Num_patches x Embed_dim) -> (B x Embed_dim x H / Patch_Size x W / Patch_Size)
282
+ features = features.permute(0, 2, 1)
283
+ features = features.reshape(
284
+ -1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size
285
+ ).contiguous()
286
+
287
+ return ViTEncoderOutput(features=features)
288
+
289
+
290
+ if __name__ == "__main__":
291
+ # Init Dense Representation Encoder for images as input
292
+ patch_embedder = DenseRepresentationEncoder(
293
+ name="dense_rep_encoder",
294
+ data_norm_type="dense_rep_encoder",
295
+ input_size_for_pe=518,
296
+ patch_size=14,
297
+ in_chans=3,
298
+ enc_embed_dim=1024,
299
+ apply_pe=False,
300
+ )
301
+
302
+ # Test dummy image input
303
+ dummy_image = torch.randn(1, 3, 518, 518)
304
+ patch_embedder_output = patch_embedder(ViTEncoderInput(data_norm_type="dense_rep_encoder", image=dummy_image))
305
+ assert patch_embedder_output.features.shape == (
306
+ 1,
307
+ 1024,
308
+ 37,
309
+ 37,
310
+ ), "Output features must have shape (1, 1024, 37, 37)"
311
+
312
+ # Init Dense Representation Encoder for non-image data as input
313
+ patch_embedder = DenseRepresentationEncoder(
314
+ name="dense_rep_encoder",
315
+ data_norm_type="dense_rep_encoder",
316
+ input_size_for_pe=518,
317
+ patch_size=14,
318
+ in_chans=6,
319
+ enc_embed_dim=1024,
320
+ )
321
+
322
+ # Init Dense Representation Encoder for single channel input
323
+ patch_embedder = DenseRepresentationEncoder(
324
+ name="dense_rep_encoder",
325
+ data_norm_type="dense_rep_encoder",
326
+ input_size_for_pe=518,
327
+ patch_size=14,
328
+ in_chans=1,
329
+ enc_embed_dim=1024,
330
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
331
+ apply_pe=True,
332
+ )
333
+
334
+ # Test dummy non-image input
335
+ dummy_image = torch.randn(1, 1, 980, 980)
336
+ patch_embedder_output = patch_embedder(ViTEncoderNonImageInput(data=dummy_image))
337
+ assert patch_embedder_output.features.shape == (
338
+ 1,
339
+ 1024,
340
+ 70,
341
+ 70,
342
+ ), "Output features must have shape (1, 1024, 70, 70)"
343
+
344
+ print("All variants of Dense Representation Encoder have been initialized successfully!")
UniCeption/uniception/models/encoders/dinov2.py ADDED
@@ -0,0 +1,333 @@
1
+ """
2
+ Encoder Class for DINOv2
3
+ """
4
+
5
+ from typing import List, Optional, Union
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from uniception.models.encoders.base import UniCeptionViTEncoderBase, ViTEncoderInput, ViTEncoderOutput
12
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner
13
+
14
+
15
+ class DINOv2Encoder(UniCeptionViTEncoderBase):
16
+ "UniCeption DINOv2 Encoder"
17
+
18
+ def __init__(
19
+ self,
20
+ name: str,
21
+ data_norm_type: str = "dinov2",
22
+ patch_size: int = 14,
23
+ size: str = "large",
24
+ with_registers: bool = False,
25
+ pretrained_checkpoint_path: str = None,
26
+ torch_hub_force_reload: bool = False,
27
+ gradient_checkpointing: bool = False,
28
+ keep_first_n_layers: Optional[int] = None,
29
+ use_pytorch_sdpa=True,
30
+ *args,
31
+ **kwargs,
32
+ ):
33
+ """
34
+ DINOv2 Encoder for extracting spatial features from images.
35
+
36
+ Args:
37
+ name (str): Name of the encoder.
38
+ data_norm_type (str): Image normalization type. Default: "dinov2"
39
+ patch_size (int): Patch size for the encoder. Default: 14
40
+ size (str): Size variant of the DINOv2 model. Options: ["small", "base", "large", "giant"]. Default: "large"
41
+ with_registers (bool): Whether to use the DINOv2 model with registers. Default: False
42
+ pretrained_checkpoint_path (str): Path to the pretrained checkpoint if using custom trained version of DINOv2. Default: None
43
+ torch_hub_force_reload (bool): Whether to force reload the model from torch hub. Default: False
44
+ gradient_checkpointing (bool): Whether to use gradient checkpointing to save GPU memory during backward call. Default: False
45
+ keep_first_n_layers (Optional[int]): If specified, only the first n layers of the model will be kept. Default: None
46
+ use_pytorch_sdpa (bool): Whether to use PyTorch native SDPA for attention layers. Default: True
47
+ """
48
+ # Init the base class
49
+ name = name if not with_registers else f"{name}_reg"
50
+ super().__init__(
51
+ name=name,
52
+ data_norm_type=data_norm_type,
53
+ patch_size=patch_size,
54
+ gradient_checkpointing=gradient_checkpointing,
55
+ *args,
56
+ **kwargs,
57
+ )
58
+
59
+ # Init the DINOv2 Encoder specific attributes
60
+ self.version = size
61
+ self.with_registers = with_registers
62
+ self.enc_embed_dim = {"small": 384, "base": 768, "large": 1024, "giant": 1536}[self.version]
63
+
64
+ # Define DINOv2 model factory
65
+ DINO_MODELS = {
66
+ # No registers
67
+ False: {
68
+ "small": "dinov2_vits14",
69
+ "base": "dinov2_vitb14",
70
+ "large": "dinov2_vitl14",
71
+ "giant": "dinov2_vitg14",
72
+ },
73
+ # With registers
74
+ True: {
75
+ "small": "dinov2_vits14_reg",
76
+ "base": "dinov2_vitb14_reg",
77
+ "large": "dinov2_vitl14_reg",
78
+ "giant": "dinov2_vitg14_reg",
79
+ },
80
+ }
81
+
82
+ # Load the pretrained DINOv2 model from torch hub
83
+ print(f"Loading pretrained {DINO_MODELS[self.with_registers][self.version]} from torch hub")
84
+ try: # Requires internet access
85
+ self.model = torch.hub.load(
86
+ "facebookresearch/dinov2",
87
+ DINO_MODELS[self.with_registers][self.version],
88
+ force_reload=torch_hub_force_reload,
89
+ )
90
+ except: # Load from cache
91
+ self.model = torch.hub.load("facebookresearch/dinov2", DINO_MODELS[self.with_registers][self.version])
92
+
93
+ del (
94
+ self.model.mask_token
95
+ ) # The mask token is unused when producing patch features and would otherwise be reported as an unused parameter
96
+
97
+ # Keep only the first n layers of the model if keep_first_n_layers is specified
98
+ if keep_first_n_layers is not None:
99
+ self.model.blocks = nn.ModuleList(self.model.blocks[:keep_first_n_layers])
100
+
101
+ # Use Native Torch SDPA for attention layers if specified (instead of DINOv2's XFormers)
102
+ if use_pytorch_sdpa:
103
+ self.enable_pytorch_native_sdpa()
104
+
105
+ # Wrap the transformer blocks with support for gradient checkpointing if required
106
+ if self.gradient_checkpointing:
107
+ for i in range(len(self.model.blocks)):
108
+ self.model.blocks[i] = self.wrap_module_with_gradient_checkpointing(self.model.blocks[i])
109
+
110
+ # Load the custom pretrained checkpoint if provided
111
+ if pretrained_checkpoint_path:
112
+ print(f"Loading custom pretrained DINOv2 checkpoint from {pretrained_checkpoint_path}")
113
+ ckpt = torch.load(pretrained_checkpoint_path, weights_only=False)
114
+ print(self.load_state_dict(ckpt["model"]))
115
+
116
+ def enable_pytorch_native_sdpa(self):
117
+ "Enable PyTorch native SDPA for attention layers"
118
+ for i in range(len(self.model.blocks)):
119
+ self.model.blocks[i].attn = self.wrap_dinov2_attention_with_sdpa(self.model.blocks[i].attn)
120
+
121
+ def wrap_dinov2_attention_with_sdpa(self, module: nn.Module):
122
+ "Wrap DINOv2 attention module with PyTorch native SDPA"
123
+ assert torch.__version__ >= "2.0", "SDPA requires PyTorch 2.0 or later"
124
+
125
+ class _AttentionWrapper(module.__class__):
126
+ "SDPA Attention Wrapper Class"
127
+
128
+ def forward(self, x: torch.Tensor, attn_bias=None) -> torch.Tensor:
129
+ B, N, C = x.shape
130
+ qkv = (
131
+ self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
132
+ ) # (3, B, H, N, C // H)
133
+
134
+ q, k, v = torch.unbind(qkv, 0) # (B, H, N, C // H)
135
+
136
+ x = F.scaled_dot_product_attention(q, k, v, attn_bias)
137
+ x = x.permute(0, 2, 1, 3).reshape(B, N, C)
138
+
139
+ x = self.proj(x)
140
+ x = self.proj_drop(x)
141
+ return x
142
+
143
+ module.__class__ = _AttentionWrapper
144
+ return module
145
+
146
+ def forward(self, encoder_input: ViTEncoderInput) -> ViTEncoderOutput:
147
+ """
148
+ DINOv2 Encoder Forward Pass
149
+
150
+ Args:
151
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
152
+
153
+ Returns:
154
+ ViTEncoderOutput: Output data from the encoder.
155
+ """
156
+ # Check image normalization type
157
+ self._check_data_normalization_type(encoder_input.data_norm_type)
158
+
159
+ # Check the dtype and shape of the input image
160
+ assert isinstance(encoder_input.image, torch.Tensor), "Input must be a torch.Tensor"
161
+ assert encoder_input.image.ndim == 4, "Input must be of shape (B, C, H, W)"
162
+ batch_size, channels, height, width = encoder_input.image.shape
163
+ assert channels == 3, "Input must have 3 channels"
164
+ assert (
165
+ height % self.patch_size == 0 and width % self.patch_size == 0
166
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
167
+
168
+ # Extract the features from the DINOv2 model
169
+ features = self.model.forward_features(encoder_input.image)["x_norm_patchtokens"]
170
+
171
+ # Resize the features to the expected shape
172
+ # (B x Num_patches x Embed_dim) -> (B x Embed_dim x H / Patch_Size x W / Patch_Size)
173
+ features = features.permute(0, 2, 1)
174
+ features = features.reshape(
175
+ -1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size
176
+ ).contiguous()
177
+
178
+ return ViTEncoderOutput(features=features)
179
+
180
+
181
+ class DINOv2IntermediateFeatureReturner(DINOv2Encoder, IntermediateFeatureReturner):
182
+ "Intermediate Feature Returner for UniCeption DINOv2 Encoder"
183
+
184
+ def __init__(
185
+ self,
186
+ name: str,
187
+ data_norm_type: str = "dinov2",
188
+ patch_size: int = 14,
189
+ size: str = "large",
190
+ with_registers: bool = False,
191
+ pretrained_checkpoint_path: str = None,
192
+ indices: Optional[Union[int, List[int]]] = 1,
193
+ keep_first_n_layers: Optional[int] = None,
194
+ norm_intermediate: bool = True,
195
+ *args,
196
+ **kwargs,
197
+ ):
198
+ """
199
+ DINOv2 Encoder for extracting spatial features from images.
200
+
201
+ Args:
202
+ name (str): Name of the encoder.
203
+ data_norm_type (str): Image normalization type. Default: "dinov2"
204
+ patch_size (int): Patch size for the encoder. Default: 14
205
+ size (str): Size variant of the DINOv2 model. Options: ["small", "base", "large", "giant"]
206
+ with_registers (bool): Whether to use the DINOv2 model with registers.
207
+ pretrained_checkpoint_path (str): Path to the pretrained checkpoint if using custom trained version of DINOv2.
208
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. Defaults to 1. Options:
209
+ - int: Return the last n layers.
210
+ - List[int]: Return the intermediate layers at the specified indices.
211
+ keep_first_n_layers (Optional[int], optional): If specified, only the first n layers of the model will be kept. Defaults to None.
212
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. Defaults to True.
213
+ """
214
+ # Init the base classes
215
+ DINOv2Encoder.__init__(
216
+ self,
217
+ name=name,
218
+ data_norm_type=data_norm_type,
219
+ patch_size=patch_size,
220
+ size=size,
221
+ with_registers=with_registers,
222
+ keep_first_n_layers=keep_first_n_layers,
223
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
224
+ *args,
225
+ **kwargs,
226
+ )
227
+ IntermediateFeatureReturner.__init__(
228
+ self,
229
+ indices=indices,
230
+ norm_intermediate=norm_intermediate,
231
+ )
232
+
233
+ def forward(self, encoder_input: ViTEncoderInput) -> List[ViTEncoderOutput]:
234
+ """
235
+ DINOv2 Encoder Forward Pass with Intermediate Feature Return
236
+
237
+ Args:
238
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
239
+
240
+ Returns:
241
+ List[ViTEncoderOutput]: Output data from the encoder. Returns a list of intermediate features.
242
+ """
243
+ # Check image normalization type
244
+ self._check_data_normalization_type(encoder_input.data_norm_type)
245
+
246
+ # Check the dtype and shape of the input image
247
+ assert isinstance(encoder_input.image, torch.Tensor), "Input must be a torch.Tensor"
248
+ assert encoder_input.image.ndim == 4, "Input must be of shape (B, C, H, W)"
249
+ batch_size, channels, height, width = encoder_input.image.shape
250
+ assert channels == 3, "Input must have 3 channels"
251
+ assert (
252
+ height % self.patch_size == 0 and width % self.patch_size == 0
253
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
254
+
255
+ if self.indices is None:
256
+ self.indices = range(len(self.model.blocks))
257
+
258
+ # Extract the intermediate features from the DINOv2 model
259
+ intermediate_features = self.model.get_intermediate_layers(
260
+ encoder_input.image, n=self.indices, reshape=True, norm=self.norm_intermediate
261
+ )
262
+
263
+ # Convert the intermediate features to a list of ViTEncoderOutput
264
+ intermediate_features = [ViTEncoderOutput(features=features) for features in intermediate_features]
265
+
266
+ return intermediate_features
267
+
268
+
269
+ if __name__ == "__main__":
270
+ # Init different variants of DINOv2
271
+ for size in ["small", "base", "large", "giant"]:
272
+ for with_registers in [False, True]:
273
+ name = f"dinov2_{size}"
274
+ dinov2_encoder = DINOv2Encoder(name=name, size=size, with_registers=with_registers)
275
+
276
+ # Init the custom pretrained DINOv2 encoders
277
+ for size in ["small", "base", "large"]:
278
+ pretrained_checkpoints_dict = {
279
+ "small": "../../../checkpoints/encoders/DINOv2_ViTS_DepthAnythingV2.pth",
280
+ "base": "../../../checkpoints/encoders/DINOv2_ViTB_DepthAnythingV2.pth",
281
+ "large": "../../../checkpoints/encoders/DINOv2_ViTL_DepthAnythingV2.pth",
282
+ }
283
+ name = f"dinov2_dav2_{size}"
284
+ dinov2_encoder = DINOv2Encoder(
285
+ name=name, size=size, with_registers=False, pretrained_checkpoint_path=pretrained_checkpoints_dict[size]
286
+ )
287
+
288
+ print("All DINOv2 Encoders have been initialized successfully!")
289
+
290
+ # Intermediate Feature Returner Tests
291
+ print("Running Intermediate Feature Returner Tests...")
292
+
293
+ # Run the intermediate feature returner with last-n index
294
+ dinov2_intermediate_feature_returner = DINOv2IntermediateFeatureReturner(
295
+ name="dinov2_base", size="base", indices=6
296
+ ) # Last 6 layers
297
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="dinov2")
298
+ output = dinov2_intermediate_feature_returner(dummy_input)
299
+ assert isinstance(output, list), "Output must be a list of intermediate features"
300
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
301
+ assert len(output) == 6, "Output must have length of intermediate features equal to the number of indices"
302
+
303
+ # Run the intermediate feature returner with specific indices
304
+ dinov2_intermediate_feature_returner = DINOv2IntermediateFeatureReturner(
305
+ name="dinov2_base", size="base", indices=[0, 2, 4, 6]
306
+ ) # Specific layers
307
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="dinov2")
308
+ output = dinov2_intermediate_feature_returner(dummy_input)
309
+ assert isinstance(output, list), "Output must be a list of intermediate features"
310
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
311
+ assert len(output) == 4, "Output must have length of intermediate features equal to the number of indices"
312
+
313
+ print("All Intermediate Feature Returner Tests have passed successfully!")
314
+
315
+ from uniception.models.encoders.utils import profile_encoder
316
+
317
+ torch.backends.cuda.matmul.allow_tf32 = True
318
+ torch.backends.cudnn.allow_tf32 = True
319
+
320
+ # Profile the DINOv2 Encoder
321
+ dinov2_encoder = DINOv2Encoder(
322
+ name="dinov2_large", size="large", use_pytorch_sdpa=True, gradient_checkpointing=True, keep_first_n_layers=12
323
+ ).cuda()
324
+ dummy_input = ViTEncoderInput(image=torch.randn(24, 3, 560, 420).cuda(), data_norm_type="dinov2")
325
+
326
+ class Profiler:
327
+ @profile_encoder(num_warmup=3, num_runs=20, autocast_precision="bfloat16", use_compile=True, dynamic=False)
328
+ def run_fn(self):
329
+ output = dinov2_encoder(dummy_input)
330
+ return output
331
+
332
+ profiler = Profiler()
333
+ profiler.run_fn()
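The SDPA wrapper above only re-expresses standard multi-head attention; below is a standalone sketch of the same qkv split and `F.scaled_dot_product_attention` call with ViT-L-like shapes (16 heads, 1024-dim embeddings), using random tensors in place of a real qkv projection.

```python
import torch
import torch.nn.functional as F

B, N, C, num_heads = 2, 256, 1024, 16
head_dim = C // num_heads

qkv = torch.randn(B, N, 3 * C)  # stand-in for the output of the fused qkv linear
qkv = qkv.reshape(B, N, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
q, k, v = torch.unbind(qkv, 0)  # each (B, num_heads, N, head_dim)

out = F.scaled_dot_product_attention(q, k, v)  # PyTorch-native attention
out = out.permute(0, 2, 1, 3).reshape(B, N, C)  # back to (B, N, C)

assert out.shape == (2, 256, 1024)
```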
UniCeption/uniception/models/encoders/global_rep_encoder.py ADDED
@@ -0,0 +1,115 @@
1
+ """
2
+ Encoder class for Global Representation Encoder
3
+ """
4
+
5
+ from functools import partial
6
+ from typing import Callable, List, Optional, Type, Union
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from uniception.models.encoders.base import EncoderGlobalRepInput, EncoderGlobalRepOutput
12
+
13
+
14
+ class GlobalRepresentationEncoder(nn.Module):
15
+ "UniCeption Global Representation Encoder"
16
+
17
+ def __init__(
18
+ self,
19
+ name: str,
20
+ in_chans: int = 3,
21
+ enc_embed_dim: int = 1024,
22
+ intermediate_dims: List[int] = [128, 256, 512],
23
+ act_layer: Type[nn.Module] = nn.GELU,
24
+ norm_layer: Union[Type[nn.Module], Callable[..., nn.Module]] = partial(nn.LayerNorm, eps=1e-6),
25
+ pretrained_checkpoint_path: Optional[str] = None,
26
+ *args,
27
+ **kwargs,
28
+ ):
29
+ """
30
+ Global Representation Encoder for projecting a global representation to a desired latent dimension.
31
+
32
+ Args:
33
+ name (str): Name of the Encoder.
34
+ in_chans (int): Number of input channels.
35
+ enc_embed_dim (int): Embedding dimension of the encoder.
36
+ intermediate_dims (List[int]): List of intermediate dimensions of the encoder.
37
+ act_layer (Type[nn.Module]): Activation layer to use in the encoder.
38
+ norm_layer (Union[Type[nn.Module], Callable[..., nn.Module]]): Final normalization layer to use in the encoder.
39
+ pretrained_checkpoint_path (Optional[str]): Path to pretrained checkpoint. (default: None)
40
+ """
41
+ super().__init__(*args, **kwargs)
42
+
43
+ # Initialize the attributes
44
+ self.name = name
45
+ self.in_chans = in_chans
46
+ self.enc_embed_dim = enc_embed_dim
47
+ self.intermediate_dims = intermediate_dims
48
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
49
+
50
+ # Init the activation layer
51
+ self.act_layer = act_layer()
52
+
53
+ # Initialize the encoder
54
+ self.encoder = nn.Sequential(
55
+ nn.Linear(self.in_chans, self.intermediate_dims[0]),
56
+ self.act_layer,
57
+ )
58
+ for intermediate_idx in range(1, len(self.intermediate_dims)):
59
+ self.encoder = nn.Sequential(
60
+ self.encoder,
61
+ nn.Linear(self.intermediate_dims[intermediate_idx - 1], self.intermediate_dims[intermediate_idx]),
62
+ self.act_layer,
63
+ )
64
+ self.encoder = nn.Sequential(
65
+ self.encoder,
66
+ nn.Linear(self.intermediate_dims[-1], self.enc_embed_dim),
67
+ )
68
+
69
+ # Init weights of the final norm layer
70
+ self.norm_layer = norm_layer(enc_embed_dim) if norm_layer else nn.Identity()
71
+ if isinstance(self.norm_layer, nn.LayerNorm):
72
+ nn.init.constant_(self.norm_layer.bias, 0)
73
+ nn.init.constant_(self.norm_layer.weight, 1.0)
74
+
75
+ # Load pretrained weights if provided
76
+ if self.pretrained_checkpoint_path is not None:
77
+ print(
78
+ f"Loading pretrained Global Representation Encoder checkpoint from {self.pretrained_checkpoint_path} ..."
79
+ )
80
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
81
+ print(self.load_state_dict(ckpt["model"]))
82
+
83
+ def forward(self, encoder_input: EncoderGlobalRepInput) -> EncoderGlobalRepOutput:
84
+ """
85
+ Global Representation Encoder Forward Pass
86
+
87
+ Args:
88
+ encoder_input (EncoderGlobalRepInput): Input data for the encoder.
89
+ The provided data must contain a tensor of size (B, C).
90
+
91
+ Returns:
92
+ EncoderGlobalRepOutput: Output features from the encoder.
93
+ """
94
+ # Get the input data and verify the shape of the input
95
+ input_data = encoder_input.data
96
+ assert input_data.ndim == 2, "Input data must have shape (B, C)"
97
+ assert input_data.shape[1] == self.in_chans, f"Input data must have {self.in_chans} channels"
98
+
99
+ # Encode the global representation
100
+ features = self.encoder(input_data)
101
+
102
+ # Normalize the output
103
+ features = self.norm_layer(features)
104
+
105
+ return EncoderGlobalRepOutput(features=features)
106
+
107
+
108
+ if __name__ == "__main__":
109
+ dummy_model = GlobalRepresentationEncoder(
110
+ name="dummy", in_chans=3, enc_embed_dim=1024, intermediate_dims=[128, 256, 512]
111
+ )
112
+ dummy_input = EncoderGlobalRepInput(data=torch.randn(4, 3))
113
+ dummy_output = dummy_model(dummy_input)
114
+ assert dummy_output.features.shape == (4, 1024), "Output features must have shape (B, 1024)"
115
+ print("Global Representation Encoder has been initialized successfully!")
UniCeption/uniception/models/encoders/image_normalizations.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ Image normalizations for the different UniCeption image encoders.
3
+ Image encoders defined in UniCeption must have their corresponding image normalization defined here.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+
8
+ import torch
9
+
10
+
11
+ @dataclass
12
+ class ImageNormalization:
13
+ mean: torch.Tensor
14
+ std: torch.Tensor
15
+
16
+
17
+ IMAGE_NORMALIZATION_DICT = {
18
+ "dummy": ImageNormalization(mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([1.0, 1.0, 1.0])),
19
+ "croco": ImageNormalization(mean=torch.tensor([0.485, 0.456, 0.406]), std=torch.tensor([0.229, 0.224, 0.225])),
20
+ "dust3r": ImageNormalization(mean=torch.tensor([0.5, 0.5, 0.5]), std=torch.tensor([0.5, 0.5, 0.5])),
21
+ "dinov2": ImageNormalization(mean=torch.tensor([0.485, 0.456, 0.406]), std=torch.tensor([0.229, 0.224, 0.225])),
22
+ "identity": ImageNormalization(mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([1.0, 1.0, 1.0])),
23
+ "patch_embedder": ImageNormalization(
24
+ mean=torch.tensor([0.485, 0.456, 0.406]), std=torch.tensor([0.229, 0.224, 0.225])
25
+ ),
26
+ "radio": ImageNormalization(mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([1.0, 1.0, 1.0])),
27
+ "sea_raft": ImageNormalization(
28
+ mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([1.0, 1.0, 1.0]) / 255
29
+ ), # Sea-RAFT uses 0-255 in FP32
30
+ "unimatch": ImageNormalization(
31
+ mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([1.0, 1.0, 1.0]) / 255
32
+ ), # UniMatch uses 0-255 in FP32
33
+ "roma": ImageNormalization(mean=torch.tensor([0.485, 0.456, 0.406]), std=torch.tensor([0.229, 0.224, 0.225])),
34
+ "cosmos": ImageNormalization(mean=torch.tensor([0.0, 0.0, 0.0]), std=torch.tensor([0.5, 0.5, 0.5])),
35
+ }
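These entries are applied channel-wise to [0, 1] RGB tensors before calling the matching encoder; a minimal sketch (assuming the `uniception` package is importable; the broadcasting itself is plain torch).

```python
import torch

from uniception.models.encoders.image_normalizations import IMAGE_NORMALIZATION_DICT

norm = IMAGE_NORMALIZATION_DICT["dust3r"]
image = torch.rand(1, 3, 224, 224)  # RGB in [0, 1]

# Broadcast the per-channel mean/std over the spatial dimensions.
normalized = (image - norm.mean.view(1, 3, 1, 1)) / norm.std.view(1, 3, 1, 1)
print(normalized.min().item(), normalized.max().item())  # roughly [-1, 1] for "dust3r"
```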
UniCeption/uniception/models/encoders/list.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ List available UniCeption encoders.
3
+ """
4
+
7
+ from uniception.models.encoders import print_available_encoder_models
8
+
9
+ if __name__ == "__main__":
10
+ print_available_encoder_models()
UniCeption/uniception/models/encoders/naradio.py ADDED
@@ -0,0 +1,502 @@
1
+ """
2
+ Encoder Class for NARADIO (RayFronts)
3
+ """
4
+
5
+ import math
6
+ from typing import List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.nn.attention.flex_attention import flex_attention
12
+
13
+ from uniception.models.encoders.base import UniCeptionViTEncoderBase, ViTEncoderInput, ViTEncoderOutput
14
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner
15
+
16
+
17
+ class GaussKernelAttn(nn.Module):
18
+ """Implementation of Gaussian Kernel based Attention using FlexAttention"""
19
+
20
+ def __init__(
21
+ self,
22
+ orig_attn,
23
+ gauss_std: float,
24
+ dim: int,
25
+ qk_norm: bool = False,
26
+ num_prefix_tokens: int = 8,
27
+ patch_size: int = 16,
28
+ ) -> None:
29
+ super().__init__()
30
+ num_heads = orig_attn.num_heads
31
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
32
+ self.num_heads = num_heads
33
+ self.head_dim = dim // num_heads
34
+ self.scale = self.head_dim**-0.5
35
+
36
+ self.addition_cache = dict()
37
+ self.input_resolution = None # to be set when calling forward
38
+ self.gauss_std = gauss_std
39
+ self.patch_size = patch_size
40
+
41
+ self.qkv = orig_attn.qkv
42
+ self.q_norm = orig_attn.q_norm if qk_norm else nn.Identity()
43
+ self.k_norm = orig_attn.k_norm if qk_norm else nn.Identity()
44
+ self.attn_drop = orig_attn.attn_drop
45
+ self.proj = orig_attn.proj
46
+ self.proj_drop = orig_attn.proj_drop
47
+ self.num_prefix_tokens = num_prefix_tokens
48
+
49
+ @staticmethod
50
+ def gaussian_window(dim1, dim2, std=7.0):
51
+ constant = 1 / (std * math.sqrt(2))
52
+ ks = list()
53
+ for dim in [dim1, dim2]:
54
+ start = -(dim - 1) / 2.0
55
+ k = torch.linspace(start=start * constant, end=(start + (dim - 1)) * constant, steps=dim, dtype=torch.float)
56
+ ks.append(k)
57
+ dist_square_to_mu = (torch.stack(torch.meshgrid(*ks, indexing="ij")) ** 2).sum(0)
58
+
59
+ return torch.exp(-dist_square_to_mu)
60
+
61
+ @staticmethod
62
+ def get_attention_addition(dim1, dim2, window, num_prefix_tokens=8):
63
+ m = torch.einsum("ij,kl->ijkl", torch.eye(dim1), torch.eye(dim2))
64
+ m = m.permute((0, 3, 1, 2)).contiguous()
65
+ out = F.conv2d(m.view(-1, dim1, dim2).unsqueeze(1), window.unsqueeze(0).unsqueeze(1), padding="same").squeeze(1)
66
+
67
+ out = out.view(dim1 * dim2, dim1 * dim2)
68
+ if num_prefix_tokens > 0:
69
+ v_adjusted = torch.vstack([torch.zeros((num_prefix_tokens, dim1 * dim2)), out])
70
+ out = torch.hstack([torch.zeros((dim1 * dim2 + num_prefix_tokens, num_prefix_tokens)), v_adjusted])
71
+
72
+ return out
73
+
74
+ def prepare_gaussian_addition(self, n_patches, device):
75
+ """Prepare the Gaussian addition matrix for the current input"""
76
+ # Check if we have a cached addition matrix for these dimensions
77
+ if n_patches not in self.addition_cache:
78
+ window_size = [side * 2 - 1 for side in n_patches]
79
+ window = self.gaussian_window(*window_size, std=self.gauss_std)
80
+ addition = self.get_attention_addition(*n_patches, window, self.num_prefix_tokens).to(device)
81
+
82
+ # Cache the addition matrix
83
+ self.addition_cache[n_patches] = addition
84
+
85
+ # Return the cached addition matrix
86
+ return self.addition_cache[n_patches]
87
+
88
+ def gauss_score_mod(self, score, b, h, q_idx, kv_idx, addition):
89
+ """Score modification function for FlexAttention"""
90
+ # Adding the precomputed Gaussian pattern to the attention score
91
+ return score + addition[q_idx, kv_idx]
92
+
93
+ def set_input_resolution(self, input_resolution: Tuple[int, int]):
94
+ """Set the input resolution for the Gaussian attention window"""
95
+ self.input_resolution = input_resolution
96
+
97
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
98
+ B, N, C = x.shape
99
+ assert self.input_resolution is not None, "input_resolution must be set before forward pass"
100
+ h, w = self.input_resolution
101
+ n_patches = (w // self.patch_size, h // self.patch_size)
102
+
103
+ qkv = self.qkv(x)
104
+ q, k, v = qkv.chunk(3, dim=-1)
105
+ q, k = self.q_norm(q), self.k_norm(k)
106
+
107
+ q = q.reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
108
+ k = k.reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
109
+ v = v.reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
110
+
111
+ addition = self.prepare_gaussian_addition(n_patches, device=x.device)
112
+
113
+ # Create a score_mod function with the current addition matrix
114
+ score_mod = lambda score, b, h, q_idx, kv_idx: self.gauss_score_mod(score, b, h, q_idx, kv_idx, addition)
115
+
116
+ # Use FlexAttention
117
+ attn_output = flex_attention(q, k, v, score_mod=score_mod)
118
+
119
+ # Reshape output and apply projection
120
+ attn_output = attn_output.transpose(1, 2).reshape(B, N, C)
121
+ attn_output = self.proj(attn_output)
122
+ attn_output = self.proj_drop(attn_output)
123
+
124
+ return attn_output
125
+
126
+
127
+ class NARADIOEncoder(UniCeptionViTEncoderBase):
128
+ """
129
+ UniCeption NARADIO (RayFronts) Encoder based on NACLIP & RADIO
130
+
131
+ The model modifies the attention of the last layer of RADIO following NACLIP,
132
+ thereby improving the spatial patch features.
133
+ """
134
+
135
+ def __init__(
136
+ self,
137
+ name: str,
138
+ data_norm_type: str = "radio",
139
+ patch_size: int = 16,
140
+ model_version: str = "radio_v2.5-l",
141
+ gauss_std: float = 7.0,
142
+ pretrained_checkpoint_path: str = None,
143
+ eradio_input_shape: Optional[tuple] = None,
144
+ torch_hub_force_reload: bool = False,
145
+ keep_first_n_layers: Optional[int] = None,
146
+ *args,
147
+ **kwargs,
148
+ ):
149
+ """
150
+ NARADIO Encoder for extracting spatial features from images.
151
+
152
+ Args:
153
+ name (str): Name of the encoder.
154
+ data_norm_type (str): Image normalization type. Default: "radio"
155
+ patch_size (int): Patch size for the encoder. Default: 16
156
+ model_version (str): Version of the RADIO model to load. Default: "radio_v2.5-l"
157
+ gauss_std (float): Standard deviation of the Gaussian kernel. Default: 7.0
158
+ pretrained_checkpoint_path (str): Path to the pretrained checkpoint if using custom trained version of RADIO. Default: None
159
+ eradio_input_shape (tuple): Input shape (height, width) for E-RADIO models. Default: None
160
+ torch_hub_force_reload (bool): Whether to force reload the model from torch hub. Default: False
161
+ keep_first_n_layers (Optional[int]): Number of layers to keep from the pretrained model. Default: None
162
+ """
163
+ # Init the base class
164
+ super().__init__(
165
+ name=name,
166
+ data_norm_type=data_norm_type,
167
+ patch_size=patch_size,
168
+ *args,
169
+ **kwargs,
170
+ )
171
+
172
+ # Init the RADIO Encoder specific attributes
173
+ self.model_version = model_version
174
+ self.enc_embed_dim = {
175
+ "radio_v2.5-b": 768,
176
+ "radio_v2.5-l": 1024,
177
+ "radio_v2.5-h": 1280,
178
+ "radio_v2.5-g": 1536,
179
+ "e-radio_v2": 1536,
180
+ }[self.model_version]
181
+
182
+ if self.model_version == "radio_v2.5-g":
183
+ assert patch_size == 14, "Patch size must be 14 for RADIO v2.5-g"
184
+ else:
185
+ assert patch_size == 16, "Patch size must be 16 for all other versions of RADIO"
186
+
187
+ # Load the pretrained RADIO model from torch hub
188
+ print(f"Loading pretrained {self.model_version} from torch hub")
189
+ try: # Requires internet access
190
+ self.model = torch.hub.load(
191
+ "NVlabs/RADIO",
192
+ "radio_model",
193
+ version=self.model_version,
194
+ progress=True,
195
+ skip_validation=True,
196
+ force_reload=torch_hub_force_reload,
197
+ )
198
+ except Exception: # Load from cache
199
+ self.model = torch.hub.load(
200
+ "NVlabs/RADIO",
201
+ "radio_model",
202
+ version=self.model_version,
203
+ progress=True,
204
+ skip_validation=True,
205
+ )
206
+
207
+ # Delete the excess blocks if keep_first_n_layers is specified
208
+ if keep_first_n_layers is not None:
209
+ assert keep_first_n_layers < len(
210
+ self.model.model.blocks
211
+ ), "keep_first_n_layers must be less than the number of blocks"
212
+ print(f"Keeping only the first {keep_first_n_layers} layers of the model")
213
+ self.model.model.blocks = torch.nn.ModuleList(self.model.model.blocks[:keep_first_n_layers])
214
+
215
+ # Set the optimal window size for E-RADIO models
216
+ if "e-radio" in self.model_version:
217
+ assert eradio_input_shape is not None, "Input shape (height, width) must be provided for E-RADIO models"
218
+ self.model.model.set_optimal_window_size(eradio_input_shape)
219
+
220
+ # Load the custom pretrained checkpoint if provided
221
+ if pretrained_checkpoint_path is not None:
222
+ print(f"Loading custom pretrained NARADIO checkpoint from {pretrained_checkpoint_path}")
223
+ ckpt = torch.load(pretrained_checkpoint_path, weights_only=False)
224
+ print(self.load_state_dict(ckpt["model"]))
225
+
226
+ # Replace the attention of the last ViT block with the Gaussian Kernel based attention
227
+ self.model.model.blocks[-1] = GaussKernelAttn(
228
+ self.model.model.blocks[-1].attn,
229
+ gauss_std,
230
+ dim=self.enc_embed_dim,
231
+ num_prefix_tokens=self.model.num_summary_tokens,
232
+ patch_size=self.patch_size,
233
+ )
234
+
235
+ def forward(self, encoder_input: ViTEncoderInput) -> ViTEncoderOutput:
236
+ """
237
+ NARADIO Encoder Forward Pass
238
+
239
+ Args:
240
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
241
+
242
+ Returns:
243
+ ViTEncoderOutput: Output data from the encoder.
244
+ """
245
+ # Check image normalization type
246
+ self._check_data_normalization_type(encoder_input.data_norm_type)
247
+
248
+ # Check the dtype and shape of the input image
249
+ assert isinstance(encoder_input.image, torch.Tensor), "Input must be a torch.Tensor"
250
+ assert encoder_input.image.ndim == 4, "Input must be of shape (B, C, H, W)"
251
+ batch_size, channels, height, width = encoder_input.image.shape
252
+ assert channels == 3, "Input must have 3 channels"
253
+ assert (
254
+ height % self.patch_size == 0 and width % self.patch_size == 0
255
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
256
+
257
+ # Set input resolution for Gaussian attention
258
+ self.model.model.blocks[-1].set_input_resolution((height, width))
259
+
260
+ # Forward pass through the RADIO encoder
261
+ summary, features = self.model(encoder_input.image)
262
+
263
+ # Resize the features to the expected shape
264
+ # (B x Num_patches x Embed_dim) -> (B x Embed_dim x H / Patch_Size x W / Patch_Size)
265
+ features = features.permute(0, 2, 1)
266
+ features = features.reshape(
267
+ -1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size
268
+ ).contiguous()
269
+
270
+ return ViTEncoderOutput(features=features)
271
+
272
+
273
+ class NARADIOIntermediateFeatureReturner(NARADIOEncoder, IntermediateFeatureReturner):
274
+ "Intermediate Feature Returner for UniCeption NARADIO Encoder"
275
+
276
+ def __init__(
277
+ self,
278
+ name: str,
279
+ data_norm_type: str = "radio",
280
+ patch_size: int = 16,
281
+ model_version: str = "radio_v2.5-l",
282
+ gauss_std: float = 7.0,
283
+ pretrained_checkpoint_path: str = None,
284
+ eradio_input_shape: Optional[tuple] = None,
285
+ indices: Union[int, List[int]] = [-1],
286
+ norm_intermediate: bool = True,
287
+ stop_early: bool = False,
288
+ intermediates_only: bool = True,
289
+ feature_adaptor: Optional[str] = None,
290
+ keep_first_n_layers: Optional[int] = None,
291
+ *args,
292
+ **kwargs,
293
+ ):
294
+ """
295
+ Intermediate Feature Returner for the NARADIO Encoder.
296
+
297
+ Args:
298
+ name (str): Name of the encoder.
299
+ data_norm_type (str): Image normalization type. Default: "radio"
300
+ patch_size (int): Patch size for the encoder. Default: 16
301
+ model_version (str): Version of the RADIO model to load. Default: "radio_v2.5-l"
302
+ gauss_std (float): Standard deviation of the gaussian kernel. Default: 7.0
303
+ pretrained_checkpoint_path (str): Path to the pretrained checkpoint if using custom trained version of RADIO.
304
+ eradio_input_shape (tuple): Input shape (height, width) for E-RADIO models. Default: None
305
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. Defaults to [-1]. Options:
306
+ - int: Return the last n layers.
307
+ - List[int]: Return the intermediate layers at the specified indices.
308
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. Defaults to True.
309
+ stop_early (bool, optional): Whether to stop early. Defaults to False.
310
+ intermediates_only (bool, optional): Whether to return only the intermediate features. Defaults to True.
311
+ feature_adaptor (Optional[str], optional): Feature adaptor to use. Defaults to None. Currently supported: "dino_v2".
312
+ keep_first_n_layers (Optional[int], optional): Number of layers to keep from the pretrained model. Defaults to None.
313
+ """
314
+ # Init the base classes
315
+ NARADIOEncoder.__init__(
316
+ self,
317
+ name=name,
318
+ data_norm_type=data_norm_type,
319
+ patch_size=patch_size,
320
+ model_version=model_version,
321
+ gauss_std=gauss_std,
322
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
323
+ eradio_input_shape=eradio_input_shape,
324
+ keep_first_n_layers=keep_first_n_layers,
325
+ *args,
326
+ **kwargs,
327
+ )
328
+ IntermediateFeatureReturner.__init__(
329
+ self,
330
+ indices=indices,
331
+ norm_intermediate=norm_intermediate,
332
+ stop_early=stop_early,
333
+ intermediates_only=intermediates_only,
334
+ )
335
+
336
+ # Convert indices to absolute indices if indices is None
337
+ if self.indices is None:
338
+ self.indices = list(range(len(self.model.model.blocks)))
339
+
340
+ self.feature_adaptor = feature_adaptor
341
+ if self.feature_adaptor is None:
342
+ pass
343
+ elif self.feature_adaptor == "dino_v2":
344
+ # Initialize a dummy radio encoder with the adaptor setting
345
+ dummy_model = torch.hub.load(
346
+ "NVlabs/RADIO",
347
+ "radio_model",
348
+ version=self.model_version,
349
+ progress=True,
350
+ skip_validation=True,
351
+ adaptor_names="dino_v2",
352
+ )
353
+
354
+ # Extract its feature converter weights
355
+ self.spatial_feature_converter = dummy_model.adaptors["dino_v2"].feat_mlp
356
+
357
+ # Update the embedding dimension because the features have been projected
358
+ self.enc_embed_dim = self.spatial_feature_converter.final[-1].out_features
359
+
360
+ del dummy_model
361
+ else:
362
+ raise ValueError("Unsupported feature adaptor. Supported: dino_v2")
363
+
364
+ def forward(
365
+ self, encoder_input: ViTEncoderInput
366
+ ) -> Union[List[ViTEncoderOutput], Tuple[ViTEncoderOutput, List[ViTEncoderOutput]]]:
367
+ """
368
+ NARADIO Encoder Forward Pass with Intermediate Feature Return
369
+
370
+ Args:
371
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
372
+
373
+ Returns:
374
+ Union[List[ViTEncoderOutput], Tuple[ViTEncoderOutput, List[ViTEncoderOutput]]]: Output data from the encoder.
375
+ If `intermediates_only` is True, returns a list of intermediate features.
376
+ Otherwise, returns a tuple with the final features and a list of intermediate features.
377
+ """
378
+ # Check image normalization type
379
+ self._check_data_normalization_type(encoder_input.data_norm_type)
380
+
381
+ # Check the dtype and shape of the input image
382
+ assert isinstance(encoder_input.image, torch.Tensor), "Input must be a torch.Tensor"
383
+ assert encoder_input.image.ndim == 4, "Input must be of shape (B, C, H, W)"
384
+ batch_size, channels, height, width = encoder_input.image.shape
385
+ assert channels == 3, "Input must have 3 channels"
386
+ assert (
387
+ height % self.patch_size == 0 and width % self.patch_size == 0
388
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
389
+
390
+ # Set input resolution for Gaussian attention
391
+ self.model.model.blocks[-1].set_input_resolution((height, width))
392
+
393
+ # Extract the final features and intermediate features accordingly
394
+ model_outputs = self.model.forward_intermediates(
395
+ encoder_input.image,
396
+ indices=self.indices,
397
+ return_prefix_tokens=False,
398
+ norm=self.norm_intermediate,
399
+ stop_early=self.stop_early,
400
+ output_fmt="NLC",
401
+ intermediates_only=self.intermediates_only,
402
+ )
403
+
404
+ # Extract the final features and intermediate features accordingly
405
+ final_features, intermediate_features = None, None
406
+ if self.intermediates_only:
407
+ intermediate_features = model_outputs
408
+ else:
409
+ final_features = model_outputs[0].features.contiguous()
410
+ intermediate_features = model_outputs[1]
411
+
412
+ # Optionally convert the features using the feature adaptor
413
+ Hp, Wp = height // self.patch_size, width // self.patch_size
414
+
415
+ # Convert final features
416
+ if final_features is not None:
417
+ if self.feature_adaptor is not None:
418
+ final_features = self.spatial_feature_converter(final_features)
419
+
420
+ # Convert to BCHW and package
421
+ final_features = final_features.view(batch_size, Hp, Wp, -1).permute(0, 3, 1, 2)
422
+ final_features = ViTEncoderOutput(features=final_features)
423
+
424
+ # Convert intermediate features
425
+ if intermediate_features is not None:
426
+ num_intermediate = len(intermediate_features)
427
+ all_intermediate_feats_tensor = torch.cat(intermediate_features, dim=0)
428
+ if self.feature_adaptor is not None:
429
+ all_intermediate_feats_tensor = self.spatial_feature_converter(all_intermediate_feats_tensor)
430
+ # Convert to BCHW
431
+ all_intermediate_feats_tensor = all_intermediate_feats_tensor.view(
432
+ num_intermediate * batch_size, Hp, Wp, -1
433
+ ).permute(0, 3, 1, 2)
434
+ all_intermediate_feats = torch.chunk(all_intermediate_feats_tensor, num_intermediate, dim=0)
435
+ intermediate_features = [ViTEncoderOutput(features=x) for x in all_intermediate_feats]
436
+
437
+ # Return the final features and intermediate features accordingly
438
+ if self.intermediates_only:
439
+ return intermediate_features
440
+ else:
441
+ return final_features, intermediate_features
442
+
443
+
444
+ if __name__ == "__main__":
445
+ # Init different versions of the RADIO Encoder
446
+ for model_version in ["radio_v2.5-b", "radio_v2.5-l"]:
447
+ naradio_encoder = NARADIOEncoder(name="NARADIOv2.5", model_version=model_version)
448
+
449
+ print("All NARADIO Encoders have been initialized successfully!")
450
+
451
+ # Intermediate Feature Returner Tests
452
+ print("Running Intermediate Feature Returner Tests...")
453
+
454
+ # Run the intermediate feature returner with last-n index
455
+ naradio_intermediate_feature_returner = NARADIOIntermediateFeatureReturner(
456
+ name="NARADIOv2.5", model_version="radio_v2.5-b", indices=6
457
+ ) # Last 6 layers
458
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
459
+ output = naradio_intermediate_feature_returner(dummy_input)
460
+ assert isinstance(output, list), "Output must be a list of intermediate features"
461
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
462
+ assert len(output) == 6, "Output must have length of intermediate features equal to the number of indices"
463
+
464
+ # Run the intermediate feature returner with specific indices
465
+ naradio_intermediate_feature_returner = NARADIOIntermediateFeatureReturner(
466
+ name="NARADIOv2.5", model_version="radio_v2.5-b", indices=[0, 2, 4, 6]
467
+ ) # Specific layers
468
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
469
+ output = naradio_intermediate_feature_returner(dummy_input)
470
+ assert isinstance(output, list), "Output must be a list of intermediate features"
471
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
472
+ assert len(output) == 4, "Output must have length of intermediate features equal to the number of indices"
473
+
474
+ # Test the normalizing of intermediate features
475
+ naradio_intermediate_feature_returner = NARADIOIntermediateFeatureReturner(
476
+ name="NARADIOv2.5", model_version="radio_v2.5-b", norm_intermediate=False, intermediates_only=False
477
+ ) # Do not normalize
478
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
479
+ output = naradio_intermediate_feature_returner(dummy_input)
480
+ assert isinstance(output, tuple), "Output must be a tuple with final features and intermediate features"
481
+ assert isinstance(output[0], ViTEncoderOutput), "First element of output must be the final features"
482
+ assert isinstance(output[1], list), "Second element of output must be a list of intermediate features"
483
+ assert isinstance(output[1][0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
484
+ if not isinstance(naradio_intermediate_feature_returner.model.model.norm, torch.nn.Identity):
485
+ assert not torch.equal(
486
+ output[0].features, output[1][0].features
487
+ ), "Final features and intermediate features must be different"
488
+
489
+ naradio_intermediate_feature_returner = NARADIOIntermediateFeatureReturner(
490
+ name="NARADIOv2.5", model_version="radio_v2.5-b", norm_intermediate=True, intermediates_only=False
491
+ )
492
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
493
+ output = naradio_intermediate_feature_returner(dummy_input)
494
+ assert isinstance(output, tuple), "Output must be a tuple with final features and intermediate features"
495
+ assert isinstance(output[0], ViTEncoderOutput), "First element of output must be the final features"
496
+ assert isinstance(output[1], list), "Second element of output must be a list of intermediate features"
497
+ assert isinstance(output[1][0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
498
+ assert torch.equal(
499
+ output[0].features, output[1][0].features
500
+ ), "Final features and intermediate features must be same"
501
+
502
+ print("All Intermediate Feature Returner Tests have passed successfully!")
UniCeption/uniception/models/encoders/patch_embedder.py ADDED
@@ -0,0 +1,235 @@
1
+ """
2
+ Encoder class for Patch Embedder
3
+ """
4
+
5
+ import math
6
+ from functools import partial
7
+ from typing import Callable, Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch.nn.init import trunc_normal_
12
+
13
+ from uniception.models.encoders.base import (
14
+ UniCeptionViTEncoderBase,
15
+ ViTEncoderInput,
16
+ ViTEncoderNonImageInput,
17
+ ViTEncoderOutput,
18
+ )
19
+
20
+
21
+ def make_2tuple(x):
22
+ if isinstance(x, tuple):
23
+ assert len(x) == 2
24
+ return x
25
+
26
+ assert isinstance(x, int)
27
+ return (x, x)
28
+
29
+
30
+ class PatchEmbedder(UniCeptionViTEncoderBase):
31
+ "UniCeption Patch Embedder"
32
+
33
+ def __init__(
34
+ self,
35
+ name: str,
36
+ data_norm_type: str = "patch_embedder",
37
+ input_size: Union[int, Tuple[int, int]] = 518,
38
+ patch_size: int = 14,
39
+ in_chans: int = 3,
40
+ enc_embed_dim: int = 1024,
41
+ norm_layer: Optional[Callable] = None,
42
+ post_pe_norm_layer: Optional[Callable] = partial(nn.LayerNorm, eps=1e-6),
43
+ interpolate_antialias: bool = False,
44
+ interpolate_offset: float = 0.1,
45
+ pretrained_checkpoint_path: str = None,
46
+ *args,
47
+ **kwargs,
48
+ ):
49
+ """
50
+ Patch Encoder for extracting patch-wise features from a spatial input of size (B, C, H, W).
51
+ Learnable positional encoding is also applied to the patch-wise features.
52
+ """
53
+ # Init the base class
54
+ super().__init__(
55
+ name=name,
56
+ data_norm_type=data_norm_type,
57
+ patch_size=patch_size,
58
+ *args,
59
+ **kwargs,
60
+ )
61
+
62
+ # Init the Patch Embedder specific attributes
63
+ patch_HW = make_2tuple(patch_size)
64
+ self.input_size = make_2tuple(input_size)
65
+ self.patches_resolution = (self.input_size[0] // patch_HW[0], self.input_size[1] // patch_HW[1])
66
+ self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
67
+ self.in_chans = in_chans
68
+ self.enc_embed_dim = enc_embed_dim
69
+
70
+ # Init the Patch Embedder layers
71
+ self.proj = nn.Conv2d(in_chans, enc_embed_dim, kernel_size=patch_HW, stride=patch_HW)
72
+ self.norm = norm_layer(enc_embed_dim) if norm_layer else nn.Identity()
73
+
74
+ # Init the learnable positional encodings
75
+ self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, self.enc_embed_dim))
76
+ trunc_normal_(self.pos_embed, std=0.02)
77
+ self.interpolate_antialias = interpolate_antialias
78
+ self.interpolate_offset = interpolate_offset
79
+
80
+ # Init the norm layer after positional encoding
81
+ self.post_pe_norm = post_pe_norm_layer(enc_embed_dim) if post_pe_norm_layer else nn.Identity()
82
+
83
+ # Load the pretrained checkpoint if provided
84
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
85
+ if self.pretrained_checkpoint_path:
86
+ print(f"Loading custom pretrained Patch Embedder checkpoint from {self.pretrained_checkpoint_path} ...")
87
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
88
+ print(self.load_state_dict(ckpt["model"]))
89
+
90
+ def interpolate_pos_encoding(self, features, height, width):
91
+ """
92
+ Interpolate the positional encoding to the expected size.
93
+
94
+ Args:
95
+ features (torch.Tensor): Input tensor of shape (B, N, C).
96
+ height (int, float): Height of the input tensor.
97
+ width (int, float): Width of the input tensor.
98
+
99
+ Returns:
100
+ torch.Tensor: Interpolated positional encoding tensor of shape (1, N, C).
101
+ """
102
+ previous_dtype = features.dtype
103
+ npatch = features.shape[1]
104
+ N = self.pos_embed.shape[1]
105
+ if npatch == N and height == width:
106
+ return self.pos_embed
107
+ patch_pos_embed = self.pos_embed.float()
108
+ dim = features.shape[-1]
109
+ height0 = height // self.patch_size
110
+ width0 = width // self.patch_size
111
+ M = int(math.sqrt(N)) # Recover the number of patches in each dimension
112
+ assert N == M * M
113
+ kwargs = {}
114
+ if self.interpolate_offset:
115
+ # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
116
+ # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
117
+ sh = float(height0 + self.interpolate_offset) / M
118
+ sw = float(width0 + self.interpolate_offset) / M
119
+ kwargs["scale_factor"] = (sh, sw)
120
+ else:
121
+ # Simply specify an output size instead of a scale factor
122
+ kwargs["size"] = (height0, width0)
123
+ patch_pos_embed = nn.functional.interpolate(
124
+ patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
125
+ mode="bicubic",
126
+ antialias=self.interpolate_antialias,
127
+ **kwargs,
128
+ )
129
+ assert (height0, width0) == patch_pos_embed.shape[-2:]
130
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
131
+
132
+ return patch_pos_embed.to(previous_dtype)
133
+
134
+ def forward(self, encoder_input: Union[ViTEncoderInput, ViTEncoderNonImageInput]) -> ViTEncoderOutput:
135
+ """
136
+ Patch Embedder Forward Pass
137
+
138
+ Args:
139
+ encoder_input (Union[ViTEncoderInput, ViTEncoderNonImageInput]): Input data for the encoder.
140
+ If input type is ViTEncoderInput, input data must contain image normalization type and normalized image tensor.
141
+ If input type is ViTEncoderNonImageInput, input data must contain a tensor of size (B, C, H, W).
142
+
143
+ Returns:
144
+ ViTEncoderOutput: Output data from the encoder.
145
+ """
146
+ # Get the input data and verify normalization if the input type is ViTEncoderInput
147
+ if isinstance(encoder_input, ViTEncoderInput):
148
+ self._check_data_normalization_type(encoder_input.data_norm_type)
149
+ input_data = encoder_input.image
150
+ elif isinstance(encoder_input, ViTEncoderNonImageInput):
151
+ input_data = encoder_input.data
152
+ else:
153
+ raise ValueError("Unsupported input type for Patch Embedder.")
154
+
155
+ # Check the dtype and shape of the input
156
+ assert isinstance(input_data, torch.Tensor), "Input must be a torch.Tensor"
157
+ assert input_data.ndim == 4, "Input must be of shape (B, C, H, W)"
158
+ batch_size, channels, height, width = input_data.shape
159
+ assert (
160
+ height % self.patch_size == 0 and width % self.patch_size == 0
161
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
162
+
163
+ # Patchify the input data and project into expected latent space
164
+ features = self.proj(input_data) # (B, C, H, W) -> (B, E, H / Patch_Size, W / Patch_Size)
165
+ features = features.flatten(2).transpose(
166
+ 1, 2
167
+ ) # (B, E, H / Patch_Size, W / Patch_Size) -> (B, H / Patch_Size * W / Patch_Size, E)
168
+ features = self.norm(features) # Normalize the features after patch embedding
169
+ features = features + self.interpolate_pos_encoding(
170
+ features, height, width
171
+ ) # (B, H / Patch_Size * W / Patch_Size, E)
172
+ features = self.post_pe_norm(features) # Normalize the features after positional encoding
173
+
174
+ # Resize the features to the expected shape
175
+ # (B x Num_patches x Embed_dim) -> (B x Embed_dim x H / Patch_Size x W / Patch_Size)
176
+ features = features.permute(0, 2, 1)
177
+ features = features.reshape(
178
+ -1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size
179
+ ).contiguous()
180
+
181
+ return ViTEncoderOutput(features=features)
182
+
183
+
184
+ if __name__ == "__main__":
185
+ # Init Patch Embedder for images as input
186
+ patch_embedder = PatchEmbedder(
187
+ name="patch_embedder",
188
+ data_norm_type="patch_embedder",
189
+ input_size=518,
190
+ patch_size=14,
191
+ in_chans=3,
192
+ enc_embed_dim=1024,
193
+ )
194
+
195
+ # Test dummy image input
196
+ dummy_image = torch.randn(1, 3, 518, 518)
197
+ patch_embedder_output = patch_embedder(ViTEncoderInput(data_norm_type="patch_embedder", image=dummy_image))
198
+ assert patch_embedder_output.features.shape == (
199
+ 1,
200
+ 1024,
201
+ 37,
202
+ 37,
203
+ ), "Output features must have shape (1, 1024, 37, 37)"
204
+
205
+ # Init Patch Embedder for non-image data as input
206
+ patch_embedder = PatchEmbedder(
207
+ name="patch_embedder",
208
+ data_norm_type="patch_embedder",
209
+ input_size=518,
210
+ patch_size=14,
211
+ in_chans=6,
212
+ enc_embed_dim=1024,
213
+ )
214
+
215
+ # Init Patch Embedder for single channel input
216
+ patch_embedder = PatchEmbedder(
217
+ name="patch_embedder",
218
+ data_norm_type="patch_embedder",
219
+ input_size=518,
220
+ patch_size=14,
221
+ in_chans=1,
222
+ enc_embed_dim=1024,
223
+ )
224
+
225
+ # Test dummy non-image input
226
+ dummy_image = torch.randn(1, 1, 518, 518)
227
+ patch_embedder_output = patch_embedder(ViTEncoderNonImageInput(data=dummy_image))
228
+ assert patch_embedder_output.features.shape == (
229
+ 1,
230
+ 1024,
231
+ 37,
232
+ 37,
233
+ ), "Output features must have shape (1, 1024, 37, 37)"
234
+
235
+ print("All variants of Patch Embedder have been initialized successfully!")
UniCeption/uniception/models/encoders/radio.py ADDED
@@ -0,0 +1,367 @@
1
+ """
2
+ Encoder Class for RADIO (Nvidia)
3
+ """
4
+
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ import torch
8
+
9
+ from uniception.models.encoders.base import UniCeptionViTEncoderBase, ViTEncoderInput, ViTEncoderOutput
10
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner
11
+
12
+
13
+ class RADIOEncoder(UniCeptionViTEncoderBase):
14
+ "UniCeption RADIO Encoder"
15
+
16
+ def __init__(
17
+ self,
18
+ name: str,
19
+ data_norm_type: str = "radio",
20
+ patch_size: int = 16,
21
+ model_version: str = "radio_v2.5-l",
22
+ pretrained_checkpoint_path: str = None,
23
+ eradio_input_shape: Optional[tuple] = None,
24
+ torch_hub_force_reload: bool = False,
25
+ keep_first_n_layers: Optional[int] = None,
26
+ *args,
27
+ **kwargs,
28
+ ):
29
+ """
30
+ RADIO Encoder for extracting spatial features from images.
31
+
32
+ Args:
33
+ name (str): Name of the encoder.
34
+ data_norm_type (str): Image normalization type. Default: "radio"
35
+ patch_size (int): Patch size for the encoder. Default: 16
36
+ model_version (str): Version of the RADIO model to load. Default: "radio_v2.5-l"
37
+ pretrained_checkpoint_path (str): Path to the pretrained checkpoint if using custom trained version of RADIO. Default: None
38
+ eradio_input_shape (tuple): Input shape (height, width) for E-RADIO models. Default: None
39
+ torch_hub_force_reload (bool): Whether to force reload the model from torch hub. Default: False
40
+ keep_first_n_layers (Optional[int]): Number of layers to keep from the pretrained model. Default: None
41
+ """
42
+ # Init the base class
43
+ super().__init__(
44
+ name=name,
45
+ data_norm_type=data_norm_type,
46
+ patch_size=patch_size,
47
+ *args,
48
+ **kwargs,
49
+ )
50
+
51
+ # Init the RADIO Encoder specific attributes
52
+ self.model_version = model_version
53
+ self.enc_embed_dim = {
54
+ "radio_v2.5-b": 768,
55
+ "radio_v2.5-l": 1024,
56
+ "radio_v2.5-h": 1280,
57
+ "radio_v2.5-g": 1536,
58
+ "e-radio_v2": 1536,
59
+ }[self.model_version]
60
+
61
+ if self.model_version == "radio_v2.5-g":
62
+ assert patch_size == 14, "Patch size must be 14 for RADIO v2.5-g"
63
+ else:
64
+ assert patch_size == 16, "Patch size must be 16 for all other versions of RADIO"
65
+
66
+ # Load the pretrained RADIO model from torch hub
67
+ print(f"Loading pretrained {self.model_version} from torch hub")
68
+ try: # Requires internet access
69
+ self.model = torch.hub.load(
70
+ "NVlabs/RADIO",
71
+ "radio_model",
72
+ version=self.model_version,
73
+ progress=True,
74
+ skip_validation=True,
75
+ force_reload=torch_hub_force_reload,
76
+ )
77
+ except Exception: # Load from cache
78
+ self.model = torch.hub.load(
79
+ "NVlabs/RADIO",
80
+ "radio_model",
81
+ version=self.model_version,
82
+ progress=True,
83
+ skip_validation=True,
84
+ )
85
+
86
+ # Delete the excess blocks if keep_first_n_layers is specified
87
+ if keep_first_n_layers is not None:
88
+ assert keep_first_n_layers < len(
89
+ self.model.model.blocks
90
+ ), "keep_first_n_layers must be less than the number of blocks"
91
+ print(f"Keeping only the first {keep_first_n_layers} layers of the model")
92
+ self.model.model.blocks = torch.nn.ModuleList(self.model.model.blocks[:keep_first_n_layers])
93
+
94
+ # Set the optimal window size for E-RADIO models
95
+ if "e-radio" in self.model_version:
96
+ assert eradio_input_shape is not None, "Input shape (height, width) must be provided for E-RADIO models"
97
+ self.model.model.set_optimal_window_size(eradio_input_shape)
98
+
99
+ # Load the custom pretrained checkpoint if provided
100
+ if pretrained_checkpoint_path is not None:
101
+ print(f"Loading custom pretrained RADIO checkpoint from {pretrained_checkpoint_path}")
102
+ ckpt = torch.load(pretrained_checkpoint_path, weights_only=False)
103
+ print(self.load_state_dict(ckpt["model"]))
104
+
105
+ def forward(self, encoder_input: ViTEncoderInput) -> ViTEncoderOutput:
106
+ """
107
+ RADIO Encoder Forward Pass
108
+
109
+ Args:
110
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
111
+
112
+ Returns:
113
+ ViTEncoderOutput: Output data from the encoder.
114
+ """
115
+ # Check image normalization type
116
+ self._check_data_normalization_type(encoder_input.data_norm_type)
117
+
118
+ # Check the dtype and shape of the input image
119
+ assert isinstance(encoder_input.image, torch.Tensor), "Input must be a torch.Tensor"
120
+ assert encoder_input.image.ndim == 4, "Input must be of shape (B, C, H, W)"
121
+ batch_size, channels, height, width = encoder_input.image.shape
122
+ assert channels == 3, "Input must have 3 channels"
123
+ assert (
124
+ height % self.patch_size == 0 and width % self.patch_size == 0
125
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
126
+
127
+ # Forward pass through the RADIO encoder
128
+ summary, features = self.model(encoder_input.image)
129
+
130
+ # Resize the features to the expected shape
131
+ # (B x Num_patches x Embed_dim) -> (B x Embed_dim x H / Patch_Size x W / Patch_Size)
132
+ features = features.permute(0, 2, 1)
133
+ features = features.reshape(
134
+ -1, self.enc_embed_dim, height // self.patch_size, width // self.patch_size
135
+ ).contiguous()
136
+
137
+ return ViTEncoderOutput(features=features)
138
+
139
+
140
+ class RADIOIntermediateFeatureReturner(RADIOEncoder, IntermediateFeatureReturner):
141
+ "Intermediate Feature Returner for UniCeption RADIO Encoder"
142
+
143
+ def __init__(
144
+ self,
145
+ name: str,
146
+ data_norm_type: str = "radio",
147
+ patch_size: int = 16,
148
+ model_version: str = "radio_v2.5-l",
149
+ pretrained_checkpoint_path: str = None,
150
+ eradio_input_shape: Optional[tuple] = None,
151
+ indices: Union[int, List[int]] = [-1],
152
+ norm_intermediate: bool = True,
153
+ stop_early: bool = False,
154
+ intermediates_only: bool = True,
155
+ feature_adaptor: Optional[str] = None,
156
+ keep_first_n_layers: Optional[int] = None,
157
+ *args,
158
+ **kwargs,
159
+ ):
160
+ """
161
+ Intermediate Feature Returner for the RADIO Encoder.
162
+
163
+ Args:
164
+ name (str): Name of the encoder.
165
+ data_norm_type (str): Image normalization type. Default: "radio"
166
+ patch_size (int): Patch size for the encoder. Default: 16
167
+ model_version (str): Version of the RADIO model to load. Default: "radio_v2.5-l"
168
+ pretrained_checkpoint_path (str): Path to the pretrained checkpoint if using custom trained version of RADIO.
169
+ eradio_input_shape (tuple): Input shape (height, width) for E-RADIO models. Default: None
170
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. Defaults to [-1]. Options:
171
+ - int: Return the last n layers.
172
+ - List[int]: Return the intermediate layers at the specified indices.
173
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. Defaults to True.
174
+ stop_early (bool, optional): Whether to stop early. Defaults to False.
175
+ intermediates_only (bool, optional): Whether to return only the intermediate features. Defaults to True.
176
+ feature_adaptor (Optional[str], optional): Feature adaptor to use. Defaults to None. Currently supported: "dino_v2".
177
+ keep_first_n_layers (Optional[int], optional): Number of layers to keep from the pretrained model. Defaults to None.
178
+ """
179
+ # Init the base classes
180
+ RADIOEncoder.__init__(
181
+ self,
182
+ name=name,
183
+ data_norm_type=data_norm_type,
184
+ patch_size=patch_size,
185
+ model_version=model_version,
186
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
187
+ eradio_input_shape=eradio_input_shape,
188
+ keep_first_n_layers=keep_first_n_layers,
189
+ *args,
190
+ **kwargs,
191
+ )
192
+ IntermediateFeatureReturner.__init__(
193
+ self,
194
+ indices=indices,
195
+ norm_intermediate=norm_intermediate,
196
+ stop_early=stop_early,
197
+ intermediates_only=intermediates_only,
198
+ )
199
+
200
+ # Convert indices to absolute indices if indices is None
201
+ if self.indices is None:
202
+ self.indices = list(range(len(self.model.model.blocks)))
203
+
204
+ self.feature_adaptor = feature_adaptor
205
+ if self.feature_adaptor is None:
206
+ pass
207
+ elif self.feature_adaptor == "dino_v2":
208
+ # Initialize a dummy radio encoder with the adaptor setting
209
+ dummy_model = torch.hub.load(
210
+ "NVlabs/RADIO",
211
+ "radio_model",
212
+ version=self.model_version,
213
+ progress=True,
214
+ skip_validation=True,
215
+ adaptor_names="dino_v2",
216
+ )
217
+
218
+ # Extract its feature converter weights
219
+ self.spatial_feature_converter = dummy_model.adaptors["dino_v2"].feat_mlp
220
+
221
+ # Update the embedding dimension because the features have been projected
222
+ self.enc_embed_dim = self.spatial_feature_converter.final[-1].out_features
223
+
224
+ del dummy_model
225
+ else:
226
+ raise ValueError("Unsupported feature adaptor. Supported: dino_v2")
227
+
228
+ def forward(
229
+ self, encoder_input: ViTEncoderInput
230
+ ) -> Union[List[ViTEncoderOutput], Tuple[ViTEncoderOutput, List[ViTEncoderOutput]]]:
231
+ """
232
+ RADIO Encoder Forward Pass with Intermediate Feature Return
233
+
234
+ Args:
235
+ encoder_input (ViTEncoderInput): Input data for the encoder. Input data must contain image normalization type and normalized image tensor.
236
+
237
+ Returns:
238
+ Union[List[ViTEncoderOutput], Tuple[ViTEncoderOutput, List[ViTEncoderOutput]]]: Output data from the encoder.
239
+ If `intermediates_only` is True, returns a list of intermediate features.
240
+ Otherwise, returns a tuple with the final features and a list of intermediate features.
241
+ """
242
+ # Check image normalization type
243
+ self._check_data_normalization_type(encoder_input.data_norm_type)
244
+
245
+ # Check the dtype and shape of the input image
246
+ assert isinstance(encoder_input.image, torch.Tensor), "Input must be a torch.Tensor"
247
+ assert encoder_input.image.ndim == 4, "Input must be of shape (B, C, H, W)"
248
+ batch_size, channels, height, width = encoder_input.image.shape
249
+ assert channels == 3, "Input must have 3 channels"
250
+ assert (
251
+ height % self.patch_size == 0 and width % self.patch_size == 0
252
+ ), f"Input shape must be divisible by patch size: {self.patch_size}"
253
+
254
+ # Extract the final features and intermediate features accordingly
255
+ model_outputs = self.model.forward_intermediates(
256
+ encoder_input.image,
257
+ indices=self.indices,
258
+ return_prefix_tokens=False,
259
+ norm=self.norm_intermediate,
260
+ stop_early=self.stop_early,
261
+ output_fmt="NLC",
262
+ intermediates_only=self.intermediates_only,
263
+ )
264
+
265
+ # Extract the final features and intermediate features accordingly
266
+ final_features, intermediate_features = None, None
267
+ if self.intermediates_only:
268
+ intermediate_features = model_outputs
269
+ else:
270
+ final_features = model_outputs[0].features.contiguous()
271
+ intermediate_features = model_outputs[1]
272
+
273
+ # Optionally convert the features using the feature adaptor
274
+ Hp, Wp = height // self.patch_size, width // self.patch_size
275
+
276
+ # Convert final features
277
+ if final_features is not None:
278
+ if self.feature_adaptor is not None:
279
+ final_features = self.spatial_feature_converter(final_features)
280
+
281
+ # Convert to BCHW and package
282
+ final_features = final_features.view(batch_size, Hp, Wp, -1).permute(0, 3, 1, 2)
283
+ final_features = ViTEncoderOutput(features=final_features)
284
+
285
+ # Convert intermediate features
286
+ if intermediate_features is not None:
287
+ num_intermediate = len(intermediate_features)
288
+ all_intermediate_feats_tensor = torch.cat(intermediate_features, dim=0)
289
+ if self.feature_adaptor is not None:
290
+ all_intermediate_feats_tensor = self.spatial_feature_converter(all_intermediate_feats_tensor)
291
+ # Convert to BCHW
292
+ all_intermediate_feats_tensor = all_intermediate_feats_tensor.view(
293
+ num_intermediate * batch_size, Hp, Wp, -1
294
+ ).permute(0, 3, 1, 2)
295
+ all_intermediate_feats = torch.chunk(all_intermediate_feats_tensor, num_intermediate, dim=0)
296
+ intermediate_features = [ViTEncoderOutput(features=x) for x in all_intermediate_feats]
297
+
298
+ # Return the final features and intermediate features accordingly
299
+ if self.intermediates_only:
300
+ return intermediate_features
301
+ else:
302
+ return final_features, intermediate_features
303
+
304
+
305
+ if __name__ == "__main__":
306
+ # Init different versions of the RADIO Encoder
307
+ for model_version in ["radio_v2.5-b", "radio_v2.5-l"]:
308
+ radio_encoder = RADIOEncoder(name="RADIOv2.5", model_version=model_version)
309
+
310
+ # Init the E-RADIO Encoder
311
+ eradio_input_shape = (512, 512)
312
+ eradio_encoder = RADIOEncoder(name="E-RADIO", model_version="e-radio_v2", eradio_input_shape=eradio_input_shape)
313
+
314
+ print("All RADIO Encoders have been initialized successfully!")
315
+
316
+ # Intermediate Feature Returner Tests
317
+ print("Running Intermediate Feature Returner Tests...")
318
+
319
+ # Run the intermediate feature returner with last-n index
320
+ radio_intermediate_feature_returner = RADIOIntermediateFeatureReturner(
321
+ name="RADIOv2.5", model_version="radio_v2.5-b", indices=6
322
+ ) # Last 6 layers
323
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
324
+ output = radio_intermediate_feature_returner(dummy_input)
325
+ assert isinstance(output, list), "Output must be a list of intermediate features"
326
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
327
+ assert len(output) == 6, "Output must have length of intermediate features equal to the number of indices"
328
+
329
+ # Run the intermediate feature returner with specific indices
330
+ radio_intermediate_feature_returner = RADIOIntermediateFeatureReturner(
331
+ name="RADIOv2.5", model_version="radio_v2.5-b", indices=[0, 2, 4, 6]
332
+ ) # Specific layers
333
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
334
+ output = radio_intermediate_feature_returner(dummy_input)
335
+ assert isinstance(output, list), "Output must be a list of intermediate features"
336
+ assert isinstance(output[0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
337
+ assert len(output) == 4, "Output must have length of intermediate features equal to the number of indices"
338
+
339
+ # Test the normalizing of intermediate features
340
+ radio_intermediate_feature_returner = RADIOIntermediateFeatureReturner(
341
+ name="RADIOv2.5", model_version="radio_v2.5-b", norm_intermediate=False, intermediates_only=False
342
+ ) # Do not normalize
343
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
344
+ output = radio_intermediate_feature_returner(dummy_input)
345
+ assert isinstance(output, tuple), "Output must be a tuple with final features and intermediate features"
346
+ assert isinstance(output[0], ViTEncoderOutput), "First element of output must be the final features"
347
+ assert isinstance(output[1], list), "Second element of output must be a list of intermediate features"
348
+ assert isinstance(output[1][0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
349
+ if not isinstance(radio_intermediate_feature_returner.model.model.norm, torch.nn.Identity):
350
+ assert not torch.equal(
351
+ output[0].features, output[1][0].features
352
+ ), "Final features and intermediate features must be different"
353
+
354
+ radio_intermediate_feature_returner = RADIOIntermediateFeatureReturner(
355
+ name="RADIOv2.5", model_version="radio_v2.5-b", norm_intermediate=True, intermediates_only=False
356
+ )
357
+ dummy_input = ViTEncoderInput(image=torch.randn(1, 3, 224, 224), data_norm_type="radio")
358
+ output = radio_intermediate_feature_returner(dummy_input)
359
+ assert isinstance(output, tuple), "Output must be a tuple with final features and intermediate features"
360
+ assert isinstance(output[0], ViTEncoderOutput), "First element of output must be the final features"
361
+ assert isinstance(output[1], list), "Second element of output must be a list of intermediate features"
362
+ assert isinstance(output[1][0], ViTEncoderOutput), "Output must be a list of ViTEncoderOutput"
363
+ assert torch.equal(
364
+ output[0].features, output[1][0].features
365
+ ), "Final features and intermediate features must be same"
366
+
367
+ print("All Intermediate Feature Returner Tests have passed successfully!")
UniCeption/uniception/models/encoders/utils.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ Utility functions for UniCeption Encoders.
3
+ """
4
+
5
+ import functools
6
+
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ def profile_encoder(num_warmup=3, num_runs=20, autocast_precision="float16", use_compile=False, dynamic=True):
12
+ def decorator(func):
13
+ @functools.wraps(func)
14
+ def wrapper(self, *args, **kwargs):
15
+ device = "cuda"
16
+ autocast_dtype = getattr(torch, autocast_precision)
17
+
18
+ # Compile the model if requested
19
+ if use_compile:
20
+ compiled_func = torch.compile(func, dynamic=dynamic, mode="max-autotune")
21
+ else:
22
+ compiled_func = func
23
+
24
+ with torch.autocast("cuda", dtype=autocast_dtype):
25
+ # Warm-up runs
26
+ for _ in range(num_warmup):
27
+ output = compiled_func(self, *args, **kwargs)
28
+ if isinstance(output, torch.Tensor):
29
+ output.sum().backward()
30
+ else:
31
+ output.features.sum().backward()
32
+ torch.cuda.synchronize()
33
+
34
+ # Clear memory cache
35
+ torch.cuda.empty_cache()
36
+
37
+ # Lists to store results
38
+ forward_times, backward_times, memory_usages = [], [], []
39
+
40
+ for _ in range(num_runs):
41
+ start_event = torch.cuda.Event(enable_timing=True)
42
+ end_event = torch.cuda.Event(enable_timing=True)
43
+
44
+ torch.cuda.reset_peak_memory_stats()
45
+ memory_before = torch.cuda.max_memory_allocated(device)
46
+
47
+ # Forward pass
48
+ start_event.record()
49
+ output = compiled_func(self, *args, **kwargs)
50
+ end_event.record()
51
+ torch.cuda.synchronize()
52
+ forward_times.append(start_event.elapsed_time(end_event))
53
+
54
+ # Backward pass
55
+ start_event.record()
56
+ if isinstance(output, torch.Tensor):
57
+ output.sum().backward()
58
+ else:
59
+ output.features.sum().backward()
60
+ end_event.record()
61
+ torch.cuda.synchronize()
62
+ backward_times.append(start_event.elapsed_time(end_event))
63
+
64
+ memory_after = torch.cuda.max_memory_allocated(device)
65
+ memory_usages.append((memory_after - memory_before) / 1e6) # Convert to MB
66
+
67
+ # Compute mean and standard deviation
68
+ fwd_mean, fwd_std = np.mean(forward_times), np.std(forward_times)
69
+ bwd_mean, bwd_std = np.mean(backward_times), np.std(backward_times)
70
+ mem_mean, mem_std = np.mean(memory_usages), np.std(memory_usages)
71
+
72
+ compile_status = (
73
+ "with torch.compile (dynamic=True)"
74
+ if use_compile and dynamic
75
+ else "with torch.compile (dynamic=False)" if use_compile else "without torch.compile"
76
+ )
77
+ print(f"Profiling results {compile_status}:")
78
+ print(f"Forward Pass Time: {fwd_mean:.2f} ± {fwd_std:.2f} ms")
79
+ print(f"Backward Pass Time: {bwd_mean:.2f} ± {bwd_std:.2f} ms")
80
+ print(f"Peak GPU Memory Usage: {mem_mean:.2f} ± {mem_std:.2f} MB")
81
+
82
+ return output
83
+
84
+ return wrapper
85
+
86
+ return decorator
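The decorator wraps an encoder method whose first argument is the module itself, so a typical use is to decorate `forward` in a subclass. The sketch below assumes a CUDA device, since the profiler relies on CUDA events and memory statistics:

```python
import torch

from uniception.models.encoders.base import ViTEncoderNonImageInput
from uniception.models.encoders.patch_embedder import PatchEmbedder
from uniception.models.encoders.utils import profile_encoder


class ProfiledPatchEmbedder(PatchEmbedder):
    # The wrapper runs warm-up iterations, then reports forward/backward time and peak memory.
    @profile_encoder(num_warmup=2, num_runs=5, autocast_precision="float16", use_compile=False)
    def forward(self, encoder_input):
        return super().forward(encoder_input)


embedder = ProfiledPatchEmbedder(
    name="patch_embedder", input_size=518, patch_size=14, in_chans=3, enc_embed_dim=1024
).cuda()
data = torch.randn(1, 3, 518, 518, device="cuda")
embedder(ViTEncoderNonImageInput(data=data))  # prints the profiling summary
```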
UniCeption/uniception/models/factory/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from uniception.models.factory.dust3r import DUSt3R
2
+
3
+ __all__ = ["DUSt3R"]
UniCeption/uniception/models/factory/dust3r.py ADDED
@@ -0,0 +1,332 @@
1
+ from typing import List, Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from uniception.models.encoders import ViTEncoderInput
7
+ from uniception.models.encoders.croco import CroCoEncoder
8
+ from uniception.models.encoders.image_normalizations import IMAGE_NORMALIZATION_DICT
9
+ from uniception.models.info_sharing.base import MultiViewTransformerInput
10
+ from uniception.models.info_sharing.cross_attention_transformer import (
11
+ MultiViewCrossAttentionTransformer,
12
+ MultiViewCrossAttentionTransformerIFR,
13
+ )
14
+ from uniception.models.libs.croco.pos_embed import RoPE2D, get_2d_sincos_pos_embed
15
+ from uniception.models.prediction_heads.adaptors import PointMapWithConfidenceAdaptor
16
+ from uniception.models.prediction_heads.base import AdaptorInput, PredictionHeadInput, PredictionHeadLayeredInput
17
+ from uniception.models.prediction_heads.dpt import DPTFeature, DPTRegressionProcessor
18
+ from uniception.models.prediction_heads.linear import LinearFeature
19
+
20
+
21
+ def is_symmetrized(gt1, gt2):
22
+ "Function to check if input pairs are symmetrized, i.e., (a, b) and (b, a) always exist in the input"
23
+ x = gt1["instance"]
24
+ y = gt2["instance"]
25
+ if len(x) == len(y) and len(x) == 1:
26
+ return False # special case of batchsize 1
27
+ ok = True
28
+ for i in range(0, len(x), 2):
29
+ ok = ok and (x[i] == y[i + 1]) and (x[i + 1] == y[i])
30
+ return ok
31
+
32
+
33
+ def interleave(tensor1, tensor2):
34
+ "Interleave two tensors along the first dimension (used to avoid redundant encoding for symmetrized pairs)"
35
+ res1 = torch.stack((tensor1, tensor2), dim=1).flatten(0, 1)
36
+ res2 = torch.stack((tensor2, tensor1), dim=1).flatten(0, 1)
37
+ return res1, res2
38
+
39
+
40
+ class DUSt3R(nn.Module):
41
+ "DUSt3R defined with UniCeption Modules"
42
+
43
+ def __init__(
44
+ self,
45
+ name: str,
46
+ data_norm_type: str = "dust3r",
47
+ img_size: tuple = (224, 224),
48
+ patch_embed_cls: str = "PatchEmbedDust3R",
49
+ pred_head_type: str = "linear",
50
+ pred_head_output_dim: int = 4,
51
+ pred_head_feature_dim: int = 256,
52
+ depth_mode: Tuple[str, float, float] = ("exp", -float("inf"), float("inf")),
53
+ conf_mode: Tuple[str, float, float] = ("exp", 1, float("inf")),
54
+ pos_embed: str = "RoPE100",
55
+ pretrained_checkpoint_path: str = None,
56
+ pretrained_encoder_checkpoint_path: str = None,
57
+ pretrained_info_sharing_checkpoint_path: str = None,
58
+ pretrained_pred_head_checkpoint_paths: List[str] = [None, None],
59
+ pretrained_pred_head_regressor_checkpoint_paths: List[str] = [None, None],
60
+ override_encoder_checkpoint_attributes: bool = False,
61
+ *args,
62
+ **kwargs,
63
+ ):
64
+ """
65
+ Two-view model containing siamese encoders followed by a two-view cross-attention transformer and respective downstream heads.
66
+ The goal is to directly output the scene representation, with the predictions for both views expressed in view1's frame (hence the asymmetry).
67
+
68
+ Args:
69
+ name (str): Name of the model.
70
+ data_norm_type (str): Type of data normalization. (default: "dust3r")
71
+ img_size (tuple): Size of input images. (default: (224, 224))
72
+ patch_embed_cls (str): Class for patch embedding. (default: "PatchEmbedDust3R"). Options:
73
+ - "PatchEmbedDust3R"
74
+ - "ManyAR_PatchEmbed"
75
+ pred_head_type (str): Type of prediction head. (default: "linear"). Options:
76
+ - "linear"
77
+ - "dpt"
78
+ pred_head_output_dim (int): Output dimension of prediction head. (default: 4)
79
+ pred_head_feature_dim (int): Feature dimension of prediction head. (default: 256)
80
+ depth_mode (Tuple[str, float, float]): Depth mode settings (mode=['linear', 'square', 'exp'], vmin, vmax). (default: ('exp', -inf, inf))
81
+ conf_mode (Tuple[str, float, float]): Confidence mode settings (mode=['linear', 'square', 'exp'], vmin, vmax). (default: ('exp', 1, inf))
82
+ pos_embed (str): Position embedding type. (default: 'RoPE100')
84
+ pretrained_checkpoint_path (str): Path to pretrained checkpoint. (default: None)
85
+ pretrained_encoder_checkpoint_path (str): Path to pretrained encoder checkpoint. (default: None)
86
+ pretrained_info_sharing_checkpoint_path (str): Path to pretrained info_sharing checkpoint. (default: None)
87
+ pretrained_pred_head_checkpoint_paths (List[str]): Paths to pretrained prediction head checkpoints. (default: None)
88
+ pretrained_pred_head_regressor_checkpoint_paths (List[str]): Paths to pretrained prediction head regressor checkpoints. (default: None)
89
+ override_encoder_checkpoint_attributes (bool): Whether to override encoder checkpoint attributes. (default: False)
90
+ """
91
+ super().__init__(*args, **kwargs)
92
+
93
+ # Initialize the attributes
94
+ self.name = name
95
+ self.data_norm_type = data_norm_type
96
+ self.img_size = img_size
97
+ self.patch_embed_cls = patch_embed_cls
98
+ self.pred_head_type = pred_head_type
99
+ self.pred_head_output_dim = pred_head_output_dim
100
+ self.depth_mode = depth_mode
101
+ self.conf_mode = conf_mode
102
+ self.pos_embed = pos_embed
103
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
104
+ self.pretrained_encoder_checkpoint_path = pretrained_encoder_checkpoint_path
105
+ self.pretrained_info_sharing_checkpoint_path = pretrained_info_sharing_checkpoint_path
106
+ self.pretrained_pred_head_checkpoint_paths = pretrained_pred_head_checkpoint_paths
107
+ self.pretrained_pred_head_regressor_checkpoint_paths = pretrained_pred_head_regressor_checkpoint_paths
108
+ self.override_encoder_checkpoint_attributes = override_encoder_checkpoint_attributes
109
+
110
+ # Initialize RoPE for the CroCo Encoder & Two-View Cross Attention Transformer
111
+ freq = float(pos_embed[len("RoPE") :])
112
+ self.rope = RoPE2D(freq=freq)
113
+
114
+ # Initialize Encoder
115
+ self.encoder = CroCoEncoder(
116
+ name=name,
117
+ data_norm_type=data_norm_type,
118
+ patch_embed_cls=patch_embed_cls,
119
+ img_size=img_size,
120
+ pretrained_checkpoint_path=pretrained_encoder_checkpoint_path,
121
+ override_checkpoint_attributes=override_encoder_checkpoint_attributes,
122
+ )
123
+
124
+ # Initialize Multi-View Cross Attention Transformer
125
+ if self.pred_head_type == "linear":
126
+ # Returns only normalized last layer features
127
+ self.info_sharing = MultiViewCrossAttentionTransformer(
128
+ name="base_info_sharing",
129
+ input_embed_dim=self.encoder.enc_embed_dim,
130
+ num_views=2,
131
+ custom_positional_encoding=self.rope,
132
+ pretrained_checkpoint_path=pretrained_info_sharing_checkpoint_path,
133
+ )
134
+ elif self.pred_head_type == "dpt":
135
+ # Returns intermediate features and normalized last layer features
136
+ self.info_sharing = MultiViewCrossAttentionTransformerIFR(
137
+ name="base_info_sharing",
138
+ input_embed_dim=self.encoder.enc_embed_dim,
139
+ num_views=2,
140
+ indices=[5, 8],
141
+ norm_intermediate=False,
142
+ custom_positional_encoding=self.rope,
143
+ pretrained_checkpoint_path=pretrained_info_sharing_checkpoint_path,
144
+ )
145
+ else:
146
+ raise ValueError(f"Invalid prediction head type: {pred_head_type}. Must be 'linear' or 'dpt'.")
147
+
148
+ # Initialize Prediction Heads
149
+ if pred_head_type == "linear":
150
+ # Initialize Prediction Head 1
151
+ self.head1 = LinearFeature(
152
+ input_feature_dim=self.info_sharing.dim,
153
+ output_dim=pred_head_output_dim,
154
+ patch_size=self.encoder.patch_size,
155
+ pretrained_checkpoint_path=pretrained_pred_head_checkpoint_paths[0],
156
+ )
157
+ # Initialize Prediction Head 2
158
+ self.head2 = LinearFeature(
159
+ input_feature_dim=self.info_sharing.dim,
160
+ output_dim=pred_head_output_dim,
161
+ patch_size=self.encoder.patch_size,
162
+ pretrained_checkpoint_path=pretrained_pred_head_checkpoint_paths[1],
163
+ )
164
+ elif pred_head_type == "dpt":
165
+ # Initialize Prediction Head 1
166
+ self.dpt_feature_head1 = DPTFeature(
167
+ patch_size=self.encoder.patch_size,
168
+ hooks=[0, 1, 2, 3],
169
+ input_feature_dims=[self.encoder.enc_embed_dim] + [self.info_sharing.dim] * 3,
170
+ feature_dim=pred_head_feature_dim,
171
+ pretrained_checkpoint_path=pretrained_pred_head_checkpoint_paths[0],
172
+ )
173
+ self.dpt_regressor_head1 = DPTRegressionProcessor(
174
+ input_feature_dim=pred_head_feature_dim,
175
+ output_dim=pred_head_output_dim,
176
+ pretrained_checkpoint_path=pretrained_pred_head_regressor_checkpoint_paths[0],
177
+ )
178
+ self.head1 = nn.Sequential(self.dpt_feature_head1, self.dpt_regressor_head1)
179
+ # Initialize Prediction Head 2
180
+ self.dpt_feature_head2 = DPTFeature(
181
+ patch_size=self.encoder.patch_size,
182
+ hooks=[0, 1, 2, 3],
183
+ input_feature_dims=[self.encoder.enc_embed_dim] + [self.info_sharing.dim] * 3,
184
+ feature_dim=pred_head_feature_dim,
185
+ pretrained_checkpoint_path=pretrained_pred_head_checkpoint_paths[1],
186
+ )
187
+ self.dpt_regressor_head2 = DPTRegressionProcessor(
188
+ input_feature_dim=pred_head_feature_dim,
189
+ output_dim=pred_head_output_dim,
190
+ pretrained_checkpoint_path=pretrained_pred_head_regressor_checkpoint_paths[1],
191
+ )
192
+ self.head2 = nn.Sequential(self.dpt_feature_head2, self.dpt_regressor_head2)
193
+
194
+ # Initialize Final Output Adaptor
195
+ self.adaptor = PointMapWithConfidenceAdaptor(
196
+ name="pointmap",
197
+ pointmap_mode=depth_mode[0],
198
+ pointmap_vmin=depth_mode[1],
199
+ pointmap_vmax=depth_mode[2],
200
+ confidence_type=conf_mode[0],
201
+ confidence_vmin=conf_mode[1],
202
+ confidence_vmax=conf_mode[2],
203
+ )
204
+
205
+ # Load pretrained weights
206
+ if self.pretrained_checkpoint_path is not None:
207
+ print(f"Loading pretrained DUSt3R weights from {self.pretrained_checkpoint_path} ...")
208
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
209
+ print(self.load_state_dict(ckpt["model"]))
210
+
211
+ def _encode_image_pairs(self, img1, img2, data_norm_type):
212
+ "Encode two different batches of images (each batch can have different image shape)"
213
+ if img1.shape[-2:] == img2.shape[-2:]:
214
+ encoder_input = ViTEncoderInput(image=torch.cat((img1, img2), dim=0), data_norm_type=data_norm_type)
215
+ encoder_output = self.encoder(encoder_input)
216
+ out, out2 = encoder_output.features.chunk(2, dim=0)
217
+ else:
218
+ encoder_input = ViTEncoderInput(image=img1, data_norm_type=data_norm_type)
219
+ out = self.encoder(encoder_input)
220
+ out = out.features
221
+ encoder_input2 = ViTEncoderInput(image=img2, data_norm_type=data_norm_type)
222
+ out2 = self.encoder(encoder_input2)
223
+ out2 = out2.features
224
+
225
+ return out, out2
226
+
227
+ def _encode_symmetrized(self, view1, view2):
228
+ "Encode image pairs accounting for symmetrization, i.e., (a, b) and (b, a) always exist in the input"
229
+ img1 = view1["img"]
230
+ img2 = view2["img"]
231
+ if is_symmetrized(view1, view2):
232
+ # Compute only half of the forward pass since the pairs are symmetrized
233
+ feat1, feat2 = self._encode_image_pairs(img1[::2], img2[::2], data_norm_type=view1["data_norm_type"])
234
+ feat1, feat2 = interleave(feat1, feat2)
235
+ else:
236
+ feat1, feat2 = self._encode_image_pairs(img1, img2, data_norm_type=view1["data_norm_type"])
237
+
238
+ return feat1, feat2
239
+
240
+ def _downstream_head(self, head_num, decout, img_shape):
241
+ "Run the respective prediction heads"
242
+ head = getattr(self, f"head{head_num}")
243
+ if self.pred_head_type == "linear":
244
+ head_input = PredictionHeadInput(last_feature=decout[f"{head_num}"])
245
+ elif self.pred_head_type == "dpt":
246
+ head_input = PredictionHeadLayeredInput(list_features=decout[f"{head_num}"], target_output_shape=img_shape)
247
+
248
+ return head(head_input)
249
+
250
+ def forward(self, view1, view2):
251
+ """
252
+ Forward pass for DUSt3R performing the following operations:
253
+ 1. Encodes the two input views (images).
254
+ 2. Combines the encoded features using a two-view cross-attention transformer.
255
+ 3. Passes the combined features through the respective prediction heads.
256
+ 4. Returns the processed final outputs for both views.
257
+
258
+ Args:
259
+ view1 (dict): Dictionary containing the first view's images and instance information.
260
+ "img" is a required key and value is a tensor of shape (B, C, H, W).
261
+ view2 (dict): Dictionary containing the second view's images and instance information.
262
+ "img" is a required key and value is a tensor of shape (B, C, H, W).
263
+
264
+ Returns:
265
+ Tuple[dict, dict]: A tuple containing the final outputs for both views.
266
+ """
267
+ # Get input shapes
268
+ _, _, height1, width1 = view1["img"].shape
269
+ _, _, height2, width2 = view2["img"].shape
270
+ shape1 = (int(height1), int(width1))
271
+ shape2 = (int(height2), int(width2))
272
+
273
+ # Encode the two images --> Each feat output: BCHW features (batch_size, feature_dim, feature_height, feature_width)
274
+ feat1, feat2 = self._encode_symmetrized(view1, view2)
275
+
276
+ # Combine all images into view-centric representation
277
+ info_sharing_input = MultiViewTransformerInput(features=[feat1, feat2])
278
+ if self.pred_head_type == "linear":
279
+ final_info_sharing_multi_view_feat = self.info_sharing(info_sharing_input)
280
+ elif self.pred_head_type == "dpt":
281
+ final_info_sharing_multi_view_feat, intermediate_info_sharing_multi_view_feat = self.info_sharing(
282
+ info_sharing_input
283
+ )
284
+
285
+ if self.pred_head_type == "linear":
286
+ # Define feature dictionary for linear head
287
+ info_sharing_outputs = {
288
+ "1": final_info_sharing_multi_view_feat.features[0].float(),
289
+ "2": final_info_sharing_multi_view_feat.features[1].float(),
290
+ }
291
+ elif self.pred_head_type == "dpt":
292
+ # Define feature dictionary for DPT head
293
+ info_sharing_outputs = {
294
+ "1": [
295
+ feat1.float(),
296
+ intermediate_info_sharing_multi_view_feat[0].features[0].float(),
297
+ intermediate_info_sharing_multi_view_feat[1].features[0].float(),
298
+ final_info_sharing_multi_view_feat.features[0].float(),
299
+ ],
300
+ "2": [
301
+ feat2.float(),
302
+ intermediate_info_sharing_multi_view_feat[0].features[1].float(),
303
+ intermediate_info_sharing_multi_view_feat[1].features[1].float(),
304
+ final_info_sharing_multi_view_feat.features[1].float(),
305
+ ],
306
+ }
307
+
308
+ # Downstream task prediction
309
+ with torch.autocast("cuda", enabled=False):
310
+ # Prediction heads
311
+ head_output1 = self._downstream_head(1, info_sharing_outputs, shape1)
312
+ head_output2 = self._downstream_head(2, info_sharing_outputs, shape2)
313
+
314
+ # Post-process outputs
315
+ final_output1 = self.adaptor(
316
+ AdaptorInput(adaptor_feature=head_output1.decoded_channels, output_shape_hw=shape1)
317
+ )
318
+ final_output2 = self.adaptor(
319
+ AdaptorInput(adaptor_feature=head_output2.decoded_channels, output_shape_hw=shape2)
320
+ )
321
+
322
+ # Convert outputs to dictionary
323
+ res1 = {
324
+ "pts3d": final_output1.value.permute(0, 2, 3, 1).contiguous(),
325
+ "conf": final_output1.confidence.permute(0, 2, 3, 1).contiguous(),
326
+ }
327
+ res2 = {
328
+ "pts3d_in_other_view": final_output2.value.permute(0, 2, 3, 1).contiguous(),
329
+ "conf": final_output2.confidence.permute(0, 2, 3, 1).contiguous(),
330
+ }
331
+
332
+ return res1, res2
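For orientation, a rough usage sketch of the class above (not part of the commit): it assumes random weights are acceptable, that 224x224 inputs match the default `img_size`, and that the view dictionaries only need the keys actually read by `forward` and the symmetrization check (`img`, `instance`, `data_norm_type`).

```python
# Rough usage sketch for the UniCeption DUSt3R assembly defined above.
import torch

from uniception.models.factory import DUSt3R

model = DUSt3R(name="dust3r_example").eval()  # random weights, linear prediction heads

view1 = {"img": torch.rand(1, 3, 224, 224), "instance": ["0"], "data_norm_type": "dust3r"}
view2 = {"img": torch.rand(1, 3, 224, 224), "instance": ["1"], "data_norm_type": "dust3r"}

with torch.no_grad():
    res1, res2 = model(view1, view2)

# Expected per the adaptor outputs above: pointmaps in view1's frame plus per-pixel confidences.
print(res1["pts3d"].shape, res1["conf"].shape)
print(res2["pts3d_in_other_view"].shape, res2["conf"].shape)
```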
UniCeption/uniception/models/info_sharing/README.md ADDED
@@ -0,0 +1,18 @@
1
+ # UniCeption Information Sharing Blocks
2
+
3
+ ## Currently Supported Information Sharing Architectures
4
+
5
+ ### UniCeptionInfoSharingBase:
6
+
7
+ - `MultiViewCrossAttentionTransformer`
8
+ - `MultiViewCrossAttentionTransformerIFR`
9
+ - `MultiViewGlobalAttentionTransformer`
10
+ - `MultiViewGlobalAttentionTransformerIFR`
11
+ - `MultiViewAlternatingAttentionTransformer`
12
+ - `MultiViewAlternatingAttentionTransformerIFR`
13
+
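For orientation, a minimal import sketch for one of the architectures listed above; the constructor arguments follow the ones the DUSt3R factory passes earlier in this commit.

```python
# Minimal sketch: instantiate one of the supported info-sharing blocks.
from uniception.models.info_sharing.cross_attention_transformer import MultiViewCrossAttentionTransformer

info_sharing = MultiViewCrossAttentionTransformer(
    name="base_info_sharing",
    input_embed_dim=768,
    num_views=2,
)
```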
14
+ ## Developer Guidelines
15
+
16
+ Please follow the main UniCeption developer guidelines described in `README.md` when contributing to the information sharing blocks. Make sure to test your implementations and add the necessary unit tests.
17
+
18
+ ## Happy Coding!
UniCeption/uniception/models/info_sharing/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ from uniception.models.info_sharing.alternating_attention_transformer import (
2
+ MultiViewAlternatingAttentionTransformer,
3
+ MultiViewAlternatingAttentionTransformerIFR,
4
+ )
5
+ from uniception.models.info_sharing.cross_attention_transformer import (
6
+ MultiViewCrossAttentionTransformer,
7
+ MultiViewCrossAttentionTransformerIFR,
8
+ MultiViewTransformerInput,
9
+ )
10
+ from uniception.models.info_sharing.diff_cross_attention_transformer import (
11
+ DifferentialMultiViewCrossAttentionTransformer,
12
+ DifferentialMultiViewCrossAttentionTransformerIFR,
13
+ )
14
+ from uniception.models.info_sharing.global_attention_transformer import (
15
+ MultiViewGlobalAttentionTransformer,
16
+ MultiViewGlobalAttentionTransformerIFR,
17
+ )
18
+
19
+ INFO_SHARING_CLASSES = {
20
+ "cross_attention": (MultiViewCrossAttentionTransformer, MultiViewCrossAttentionTransformerIFR),
21
+ "diff_cross_attention": (
22
+ DifferentialMultiViewCrossAttentionTransformer,
23
+ DifferentialMultiViewCrossAttentionTransformerIFR,
24
+ ),
25
+ "alternating_attention": (
26
+ MultiViewAlternatingAttentionTransformer,
27
+ MultiViewAlternatingAttentionTransformerIFR,
28
+ ),
29
+ "global_attention": (
30
+ MultiViewGlobalAttentionTransformer,
31
+ MultiViewGlobalAttentionTransformerIFR,
32
+ ),
33
+ }
34
+
35
+ __all__ = ["INFO_SHARING_CLASSES", "MultiViewTransformerInput"]
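A small sketch of how the `INFO_SHARING_CLASSES` registry above can be used to pick an architecture by key; the keyword arguments are the ones the DUSt3R factory uses earlier in this commit and are assumed to apply to the plain variant here.

```python
# Sketch: each registry entry maps a key to (plain, intermediate-feature-returning) classes.
from uniception.models.info_sharing import INFO_SHARING_CLASSES

plain_cls, ifr_cls = INFO_SHARING_CLASSES["cross_attention"]
info_sharing = plain_cls(name="base_info_sharing", input_embed_dim=768, num_views=2)
```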
UniCeption/uniception/models/info_sharing/alternating_attention_transformer.py ADDED
@@ -0,0 +1,944 @@
1
+ """
2
+ UniCeption Alternating-Attention Transformer for Information Sharing
3
+ """
4
+
5
+ from functools import partial
6
+ from typing import Callable, List, Optional, Tuple, Type, Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from uniception.models.info_sharing.base import (
13
+ MultiViewTransformerInput,
14
+ MultiViewTransformerOutput,
15
+ UniCeptionInfoSharingBase,
16
+ )
17
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner, feature_take_indices
18
+ from uniception.models.utils.positional_encoding import PositionGetter
19
+ from uniception.models.utils.transformer_blocks import Mlp, SelfAttentionBlock
20
+
21
+
22
+ class MultiViewAlternatingAttentionTransformer(UniCeptionInfoSharingBase):
23
+ "UniCeption Multi-View Alternating-Attention Transformer for information sharing across image features from different views."
24
+
25
+ def __init__(
26
+ self,
27
+ name: str,
28
+ input_embed_dim: int,
29
+ use_pe_for_non_reference_views: bool = False,
30
+ max_num_views_for_pe: int = 1000,
31
+ use_rand_idx_pe_for_non_reference_views: bool = True,
32
+ size: Optional[str] = None,
33
+ depth: int = 12,
34
+ dim: int = 768,
35
+ num_heads: int = 12,
36
+ mlp_ratio: float = 4.0,
37
+ qkv_bias: bool = True,
38
+ qk_norm: bool = False,
39
+ proj_drop: float = 0.0,
40
+ attn_drop: float = 0.0,
41
+ init_values: Optional[float] = None,
42
+ drop_path: float = 0.0,
43
+ act_layer: Type[nn.Module] = nn.GELU,
44
+ norm_layer: Union[Type[nn.Module], Callable[..., nn.Module]] = partial(nn.LayerNorm, eps=1e-6),
45
+ mlp_layer: Type[nn.Module] = Mlp,
46
+ custom_positional_encoding: Optional[Callable] = None,
47
+ pretrained_checkpoint_path: Optional[str] = None,
48
+ gradient_checkpointing: bool = False,
49
+ *args,
50
+ **kwargs,
51
+ ):
52
+ """
53
+ Initialize the Multi-View Alternating-Attention Transformer for information sharing across image features from different views.
54
+ Alternates between global and frame-level attention.
55
+
56
+ Args:
57
+ input_embed_dim (int): Dimension of input embeddings.
58
+ use_pe_for_non_reference_views (bool): Whether to use view positional encoding for input non-reference views. (default: False)
59
+ max_num_views_for_pe (int): Maximum number of views for positional encoding. (default: 1000)
60
+ use_rand_idx_pe_for_non_reference_views (bool): Whether to use random index positional encoding for non-reference views. (default: True)
61
+ size (str): String to indicate interpretable size of the transformer (for e.g., base, large, ...). (default: None)
62
+ depth (int): Number of transformer layers. (default: 12, base size)
63
+ dim (int): Dimension of the transformer. (default: 768, base size)
64
+ num_heads (int): Number of attention heads. (default: 12, base size)
65
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
66
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
67
+ qk_norm (bool): Whether to normalize q and k (default: False)
68
+ proj_drop (float): Dropout rate for output (default: 0.)
69
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
70
+ init_values (float): Initial value for LayerScale gamma (default: None)
71
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
72
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
73
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
74
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
75
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
76
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
77
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
78
+ """
79
+ # Initialize the base class
80
+ super().__init__(name=name, size=size, *args, **kwargs)
81
+
82
+ # Initialize the specific attributes of the transformer
83
+ self.input_embed_dim = input_embed_dim
84
+ self.use_pe_for_non_reference_views = use_pe_for_non_reference_views
85
+ self.max_num_views_for_pe = max_num_views_for_pe
86
+ self.use_rand_idx_pe_for_non_reference_views = use_rand_idx_pe_for_non_reference_views
87
+ self.depth = depth
88
+ self.dim = dim
89
+ self.num_heads = num_heads
90
+ self.mlp_ratio = mlp_ratio
91
+ self.qkv_bias = qkv_bias
92
+ self.qk_norm = qk_norm
93
+ self.proj_drop = proj_drop
94
+ self.attn_drop = attn_drop
95
+ self.init_values = init_values
96
+ self.drop_path = drop_path
97
+ self.act_layer = act_layer
98
+ self.norm_layer = norm_layer
99
+ self.mlp_layer = mlp_layer
100
+ self.custom_positional_encoding = custom_positional_encoding
101
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
102
+ self.gradient_checkpointing = gradient_checkpointing
103
+
104
+ # Initialize the projection layer for input embeddings
105
+ if self.input_embed_dim != self.dim:
106
+ self.proj_embed = nn.Linear(self.input_embed_dim, self.dim, bias=True)
107
+ else:
108
+ self.proj_embed = nn.Identity()
109
+
110
+ # Initialize the self-attention blocks which ingest all views at once
111
+ self.self_attention_blocks = nn.ModuleList(
112
+ [
113
+ SelfAttentionBlock(
114
+ dim=self.dim,
115
+ num_heads=self.num_heads,
116
+ mlp_ratio=self.mlp_ratio,
117
+ qkv_bias=self.qkv_bias,
118
+ qk_norm=self.qk_norm,
119
+ proj_drop=self.proj_drop,
120
+ attn_drop=self.attn_drop,
121
+ init_values=self.init_values,
122
+ drop_path=self.drop_path,
123
+ act_layer=self.act_layer,
124
+ norm_layer=self.norm_layer,
125
+ mlp_layer=self.mlp_layer,
126
+ custom_positional_encoding=self.custom_positional_encoding,
127
+ )
128
+ for _ in range(self.depth)
129
+ ]
130
+ )
131
+
132
+ # Initialize the final normalization layer
133
+ self.norm = self.norm_layer(self.dim)
134
+
135
+ # Initialize the position getter for patch positions if required
136
+ if self.custom_positional_encoding is not None:
137
+ self.position_getter = PositionGetter()
138
+
139
+ if self.use_pe_for_non_reference_views:
140
+ # Initialize the positional encoding table for the different views
141
+ self.register_buffer(
142
+ "view_pos_table",
143
+ self._get_sinusoid_encoding_table(self.max_num_views_for_pe, self.dim, 10000),
144
+ )
145
+ else:
146
+ # Initialize the positional encoding table for the reference view
147
+ self.register_buffer(
148
+ "view_pos_table",
149
+ self._get_sinusoid_encoding_table(1, self.dim, 10000),
150
+ )
151
+
152
+ # Initialize random weights
153
+ self.initialize_weights()
154
+
155
+ # Apply gradient checkpointing if enabled
156
+ if self.gradient_checkpointing:
157
+ for i, block in enumerate(self.self_attention_blocks):
158
+ self.self_attention_blocks[i] = self.wrap_module_with_gradient_checkpointing(block)
159
+
160
+ # Load pretrained weights if provided
161
+ if self.pretrained_checkpoint_path is not None:
162
+ print(
163
+ f"Loading pretrained multi-view Alternating-Attention transformer weights from {self.pretrained_checkpoint_path} ..."
164
+ )
165
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
166
+ print(self.load_state_dict(ckpt["model"]))
167
+
168
+ def _get_sinusoid_encoding_table(self, n_position, d_hid, base):
169
+ "Sinusoid position encoding table"
170
+
171
+ def get_position_angle_vec(position):
172
+ return [position / np.power(base, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
173
+
174
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
175
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
176
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
177
+
178
+ return torch.FloatTensor(sinusoid_table)
179
+
180
+ def initialize_weights(self):
181
+ "Initialize weights of the transformer."
182
+ # Linears and layer norms
183
+ self.apply(self._init_weights)
184
+
185
+ def _init_weights(self, m):
186
+ "Initialize the transformer linear and layer norm weights."
187
+ if isinstance(m, nn.Linear):
188
+ # We use xavier_uniform following official JAX ViT:
189
+ torch.nn.init.xavier_uniform_(m.weight)
190
+ if isinstance(m, nn.Linear) and m.bias is not None:
191
+ nn.init.constant_(m.bias, 0)
192
+ elif isinstance(m, nn.LayerNorm):
193
+ nn.init.constant_(m.bias, 0)
194
+ nn.init.constant_(m.weight, 1.0)
195
+
196
+ def forward(
197
+ self,
198
+ model_input: MultiViewTransformerInput,
199
+ ) -> MultiViewTransformerOutput:
200
+ """
201
+ Forward interface for the Multi-View Alternating-Attention Transformer.
202
+
203
+ Args:
204
+ model_input (MultiViewTransformerInput): Input to the model.
205
+ Expects the features to be a list of tensors, each of shape (batch, input_embed_dim, height, width),
206
+ where each entry corresponds to a different view.
207
+ Optionally, the input can also include additional_input_tokens (e.g., class token, registers, pose tokens, scale token)
208
+ which are appended to the token set from the multi-view features. The tokens are of size (batch, input_embed_dim, num_of_additional_tokens).
209
+
210
+ Returns:
211
+ MultiViewTransformerOutput: Output of the model post information sharing.
212
+ """
213
+ # Check that the number of views matches the input and the features are of expected shape
214
+ if self.use_pe_for_non_reference_views:
215
+ assert (
216
+ len(model_input.features) <= self.max_num_views_for_pe
217
+ ), f"Expected less than {self.max_num_views_for_pe} views, got {len(model_input.features)}"
218
+ assert all(
219
+ view_features.shape[1] == self.input_embed_dim for view_features in model_input.features
220
+ ), f"All views must have input dimension {self.input_embed_dim}"
221
+ assert all(
222
+ view_features.ndim == 4 for view_features in model_input.features
223
+ ), "All views must have 4 dimensions (N, C, H, W)"
224
+
225
+ # Initialize the multi-view features from the model input and number of views for current input
226
+ multi_view_features = model_input.features
227
+ num_of_views = len(multi_view_features)
228
+ batch_size, _, height, width = multi_view_features[0].shape
229
+ num_of_tokens_per_view = height * width
230
+
231
+ # Stack the multi-view features (N, C, H, W) to (N, V, C, H, W) (assumes all V views have same shape)
232
+ multi_view_features = torch.stack(multi_view_features, dim=1)
233
+
234
+ # Resize the multi-view features from NVCHW to NLC, where L = V * H * W
235
+ multi_view_features = multi_view_features.permute(0, 1, 3, 4, 2) # (N, V, H, W, C)
236
+ multi_view_features = multi_view_features.reshape(
237
+ batch_size, num_of_views * height * width, self.input_embed_dim
238
+ ).contiguous()
239
+
240
+ # Process additional input tokens if provided
241
+ if model_input.additional_input_tokens is not None:
242
+
243
+ additional_tokens = model_input.additional_input_tokens
244
+ assert additional_tokens.ndim == 3, "Additional tokens must have 3 dimensions (N, C, T)"
245
+ assert (
246
+ additional_tokens.shape[1] == self.input_embed_dim
247
+ ), f"Additional tokens must have input dimension {self.input_embed_dim}"
248
+ assert additional_tokens.shape[0] == batch_size, "Batch size mismatch for additional tokens"
249
+
250
+ # Reshape to channel-last format for transformer processing
251
+ additional_tokens = additional_tokens.permute(0, 2, 1).contiguous() # (N, C, T) -> (N, T, C)
252
+
253
+ # Concatenate the additional tokens to the multi-view features
254
+ multi_view_features = torch.cat([multi_view_features, additional_tokens], dim=1)
255
+
256
+ # Project input features to the transformer dimension
257
+ multi_view_features = self.proj_embed(multi_view_features)
258
+
259
+ # Create patch positions for each view if custom positional encoding is used
260
+ if self.custom_positional_encoding is not None:
261
+ multi_view_positions = [
262
+ self.position_getter(batch_size, height, width, multi_view_features.device)
263
+ ] * num_of_views # List of length V, where each tensor is (N, H * W, C)
264
+ multi_view_positions = torch.cat(multi_view_positions, dim=1) # (N, V * H * W, C)
265
+ else:
266
+ multi_view_positions = [None] * num_of_views
267
+
268
+ # Add None positions for additional tokens if they exist
269
+ if model_input.additional_input_tokens is not None:
270
+
271
+ additional_tokens_positions = [None] * model_input.additional_input_tokens.shape[1]
272
+ multi_view_positions = multi_view_positions + additional_tokens_positions
273
+
274
+ # Add positional encoding for reference view (idx 0)
275
+ ref_view_pe = self.view_pos_table[0].clone().detach()
276
+ ref_view_pe = ref_view_pe.reshape((1, 1, self.dim))
277
+ ref_view_pe = ref_view_pe.repeat(batch_size, num_of_tokens_per_view, 1)
278
+ ref_view_features = multi_view_features[:, :num_of_tokens_per_view, :]
279
+ ref_view_features = ref_view_features + ref_view_pe
280
+
281
+ if self.use_pe_for_non_reference_views:
282
+ # Add positional encoding for non-reference views (sequential indices starting from idx 1 or random indices which are uniformly sampled)
283
+ if self.use_rand_idx_pe_for_non_reference_views:
284
+ non_ref_view_pe_indices = torch.randint(low=1, high=self.max_num_views_for_pe, size=(num_of_views - 1,))
285
+ else:
286
+ non_ref_view_pe_indices = torch.arange(1, num_of_views)
287
+ non_ref_view_pe = self.view_pos_table[non_ref_view_pe_indices].clone().detach()
288
+ non_ref_view_pe = non_ref_view_pe.reshape((1, num_of_views - 1, self.dim))
289
+ non_ref_view_pe = non_ref_view_pe.repeat_interleave(num_of_tokens_per_view, dim=1)
290
+ non_ref_view_pe = non_ref_view_pe.repeat(batch_size, 1, 1)
291
+ non_ref_view_features = multi_view_features[
292
+ :, num_of_tokens_per_view : num_of_views * num_of_tokens_per_view, :
293
+ ]
294
+ non_ref_view_features = non_ref_view_features + non_ref_view_pe
295
+ else:
296
+ non_ref_view_features = multi_view_features[
297
+ :, num_of_tokens_per_view : num_of_views * num_of_tokens_per_view, :
298
+ ]
299
+
300
+ # Concatenate the reference and non-reference view features
301
+ # Handle additional tokens (no view-based positional encoding for them)
302
+ if model_input.additional_input_tokens is not None:
303
+
304
+ additional_features = multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
305
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features, additional_features], dim=1)
306
+ else:
307
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features], dim=1)
308
+
309
+ # Loop over the depth of the transformer
310
+ for depth_idx in range(self.depth):
311
+ if depth_idx % 2 == 0:
312
+ # Apply the self-attention block and update the multi-view features
313
+ # Global attention across all views
314
+ multi_view_features = self.self_attention_blocks[depth_idx](multi_view_features, multi_view_positions)
315
+ else:
316
+ # Handle additional tokens separately for frame-level attention
317
+ additional_features = None
318
+ additional_positions = None
319
+ if model_input.additional_input_tokens is not None:
320
+
321
+ # Extract additional token features
322
+ additional_features = multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
323
+ # Keep only view features for frame-level attention
324
+ multi_view_features = multi_view_features[:, : num_of_views * num_of_tokens_per_view, :]
325
+
326
+ # Handle positions for additional tokens if custom positional encoding is used
327
+ if self.custom_positional_encoding is not None:
328
+ additional_positions = multi_view_positions[:, num_of_views * num_of_tokens_per_view :, :]
329
+ multi_view_positions = multi_view_positions[:, : num_of_views * num_of_tokens_per_view, :]
330
+
331
+ # Reshape the multi-view features from (N, V * H * W, C) to (N * V, H * W, C)
332
+ multi_view_features = multi_view_features.reshape(
333
+ batch_size * num_of_views, num_of_tokens_per_view, self.dim
334
+ ).contiguous() # (N * V, H * W, C)
335
+ if multi_view_positions[0] is not None:
336
+ multi_view_positions = multi_view_positions.reshape(
337
+ batch_size * num_of_views, num_of_tokens_per_view, 2
338
+ ).contiguous() # (N * V, H * W, C)
339
+
340
+ # Apply the self-attention block and update the multi-view features
341
+ # Frame-level attention within each view
342
+ multi_view_features = self.self_attention_blocks[depth_idx](multi_view_features, multi_view_positions)
343
+
344
+ # Reshape the multi-view features from (N * V, H * W, C) back to (N, V * H * W, C)
345
+ multi_view_features = multi_view_features.reshape(
346
+ batch_size, num_of_views * num_of_tokens_per_view, self.dim
347
+ ).contiguous() # (N, V * H * W, C)
348
+ if multi_view_positions[0] is not None:
349
+ multi_view_positions = multi_view_positions.reshape(
350
+ batch_size, num_of_views * num_of_tokens_per_view, 2
351
+ ).contiguous() # (N, V * H * W, C)
352
+
353
+ # Reattach additional tokens if they exist
354
+ if additional_features is not None:
355
+ multi_view_features = torch.cat([multi_view_features, additional_features], dim=1)
356
+ # Reattach positions for additional tokens if they exist
357
+ if additional_positions is not None:
358
+ multi_view_positions = torch.cat([multi_view_positions, additional_positions], dim=1)
359
+
360
+ # Normalize the output features
361
+ output_multi_view_features = self.norm(multi_view_features)
362
+
363
+ # Extract only the view features (excluding additional tokens)
364
+ view_features = output_multi_view_features[:, : num_of_views * num_of_tokens_per_view, :]
365
+
366
+ # Reshape the output multi-view features (N, V * H * W, C) back to (N, V, C, H, W)
367
+ view_features = view_features.reshape(batch_size, num_of_views, height, width, self.dim) # (N, V, H, W, C)
368
+ view_features = view_features.permute(0, 1, 4, 2, 3).contiguous() # (N, V, C, H, W)
369
+
370
+ # Split the output multi-view features into separate views
371
+ view_features = view_features.split(1, dim=1)
372
+ view_features = [output_view_features.squeeze(dim=1) for output_view_features in view_features]
373
+
374
+ # Extract and return additional token features if provided
375
+ if model_input.additional_input_tokens is not None:
376
+
377
+ additional_token_features = output_multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
378
+ additional_token_features = additional_token_features.permute(0, 2, 1).contiguous() # (N, C, T)
379
+ return MultiViewTransformerOutput(
380
+ features=view_features, additional_token_features=additional_token_features
381
+ )
382
+ else:
383
+ return MultiViewTransformerOutput(features=view_features)
384
+
385
+
386
+ class MultiViewAlternatingAttentionTransformerIFR(
387
+ MultiViewAlternatingAttentionTransformer, IntermediateFeatureReturner
388
+ ):
389
+ "Intermediate Feature Returner for UniCeption Multi-View Alternating-Attention Transformer"
390
+
391
+ def __init__(
392
+ self,
393
+ name: str,
394
+ input_embed_dim: int,
395
+ use_pe_for_non_reference_views: bool = False,
396
+ max_num_views_for_pe: int = 1000,
397
+ use_rand_idx_pe_for_non_reference_views: bool = True,
398
+ size: Optional[str] = None,
399
+ depth: int = 12,
400
+ dim: int = 768,
401
+ num_heads: int = 12,
402
+ mlp_ratio: float = 4.0,
403
+ qkv_bias: bool = True,
404
+ qk_norm: bool = False,
405
+ proj_drop: float = 0.0,
406
+ attn_drop: float = 0.0,
407
+ init_values: Optional[float] = None,
408
+ drop_path: float = 0.0,
409
+ act_layer: nn.Module = nn.GELU,
410
+ norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),
411
+ mlp_layer: nn.Module = Mlp,
412
+ custom_positional_encoding: Callable = None,
413
+ pretrained_checkpoint_path: str = None,
414
+ indices: Optional[Union[int, List[int]]] = None,
415
+ norm_intermediate: bool = True,
416
+ intermediates_only: bool = False,
417
+ gradient_checkpointing: bool = False,
418
+ *args,
419
+ **kwargs,
420
+ ):
421
+ """
422
+ Initialize the Multi-View Alternating-Attention Transformer for information sharing across image features from different views.
423
+ Extends the base class to return intermediate features.
424
+
425
+ Args:
426
+ input_embed_dim (int): Dimension of input embeddings.
427
+ use_pe_for_non_reference_views (bool): Whether to use view positional encoding for input non-reference views. (default: False)
428
+ max_num_views_for_pe (int): Maximum number of views for positional encoding. (default: 1000)
429
+ use_rand_idx_pe_for_non_reference_views (bool): Whether to use random index positional encoding for non-reference views. (default: True)
431
+ size (str): String to indicate interpretable size of the transformer (for e.g., base, large, ...). (default: None)
432
+ depth (int): Number of transformer layers. (default: 12, base size)
433
+ dim (int): Dimension of the transformer. (default: 768, base size)
434
+ num_heads (int): Number of attention heads. (default: 12, base size)
435
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
436
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
437
+ qk_norm (bool): Whether to normalize q and k (default: False)
438
+ proj_drop (float): Dropout rate for output (default: 0.)
439
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
440
+ init_values (float): Initial value for LayerScale gamma (default: None)
441
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
442
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
443
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
444
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
445
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
446
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
447
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. (default: None) Options:
448
+ - None: Return all intermediate layers.
449
+ - int: Return the last n layers.
450
+ - List[int]: Return the intermediate layers at the specified indices.
451
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. (default: True)
452
+ intermediates_only (bool, optional): Whether to return only the intermediate features. (default: False)
453
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
454
+ """
455
+ # Init the base classes
456
+ MultiViewAlternatingAttentionTransformer.__init__(
457
+ self,
458
+ name=name,
459
+ input_embed_dim=input_embed_dim,
460
+ use_pe_for_non_reference_views=use_pe_for_non_reference_views,
461
+ max_num_views_for_pe=max_num_views_for_pe,
462
+ use_rand_idx_pe_for_non_reference_views=use_rand_idx_pe_for_non_reference_views,
463
+ size=size,
464
+ depth=depth,
465
+ dim=dim,
466
+ num_heads=num_heads,
467
+ mlp_ratio=mlp_ratio,
468
+ qkv_bias=qkv_bias,
469
+ qk_norm=qk_norm,
470
+ proj_drop=proj_drop,
471
+ attn_drop=attn_drop,
472
+ init_values=init_values,
473
+ drop_path=drop_path,
474
+ act_layer=act_layer,
475
+ norm_layer=norm_layer,
476
+ mlp_layer=mlp_layer,
477
+ custom_positional_encoding=custom_positional_encoding,
478
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
479
+ gradient_checkpointing=gradient_checkpointing,
480
+ *args,
481
+ **kwargs,
482
+ )
483
+ IntermediateFeatureReturner.__init__(
484
+ self,
485
+ indices=indices,
486
+ norm_intermediate=norm_intermediate,
487
+ intermediates_only=intermediates_only,
488
+ )
489
+
490
+ def forward(
491
+ self,
492
+ model_input: MultiViewTransformerInput,
493
+ ) -> Union[
494
+ List[MultiViewTransformerOutput],
495
+ Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]],
496
+ ]:
497
+ """
498
+ Forward interface for the Multi-View Alternating-Attention Transformer with Intermediate Feature Return.
499
+
500
+ Args:
501
+ model_input (MultiViewTransformerInput): Input to the model.
502
+ Expects the features to be a list of tensors, each of shape (batch, input_embed_dim, height, width),
503
+ where each entry corresponds to a different view.
504
+ Optionally, the input can also include additional_input_tokens (e.g., class token, registers, pose tokens, scale token)
505
+ which are appended to the token set from the multi-view features. The tokens are of size (batch, input_embed_dim, num_of_additional_tokens).
506
+
507
+ Returns:
508
+ Union[List[MultiViewTransformerOutput], Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]]]:
509
+ Output of the model post information sharing.
510
+ If intermediates_only is True, returns a list of intermediate outputs.
511
+ If intermediates_only is False, returns a tuple of final output and a list of intermediate outputs.
512
+ """
513
+ # Check that the number of views matches the input and the features are of expected shape
514
+ if self.use_pe_for_non_reference_views:
515
+ assert (
516
+ len(model_input.features) <= self.max_num_views_for_pe
517
+ ), f"Expected less than {self.max_num_views_for_pe} views, got {len(model_input.features)}"
518
+ assert all(
519
+ view_features.shape[1] == self.input_embed_dim for view_features in model_input.features
520
+ ), f"All views must have input dimension {self.input_embed_dim}"
521
+ assert all(
522
+ view_features.ndim == 4 for view_features in model_input.features
523
+ ), "All views must have 4 dimensions (N, C, H, W)"
524
+
525
+ # Get the indices of the intermediate features to return
526
+ intermediate_multi_view_features = []
527
+ take_indices, _ = feature_take_indices(self.depth, self.indices)
528
+
529
+ # Initialize the multi-view features from the model input and number of views for current input
530
+ multi_view_features = model_input.features
531
+ num_of_views = len(multi_view_features)
532
+ batch_size, _, height, width = multi_view_features[0].shape
533
+ num_of_tokens_per_view = height * width
534
+
535
+ # Stack the multi-view features (N, C, H, W) to (N, V, C, H, W) (assumes all V views have same shape)
536
+ multi_view_features = torch.stack(multi_view_features, dim=1)
537
+
538
+ # Resize the multi-view features from NVCHW to NLC, where L = V * H * W
539
+ multi_view_features = multi_view_features.permute(0, 1, 3, 4, 2) # (N, V, H, W, C)
540
+ multi_view_features = multi_view_features.reshape(
541
+ batch_size, num_of_views * height * width, self.input_embed_dim
542
+ ).contiguous()
543
+
544
+ # Process additional input tokens if provided
545
+ if model_input.additional_input_tokens is not None:
546
+
547
+ additional_tokens = model_input.additional_input_tokens
548
+ assert additional_tokens.ndim == 3, "Additional tokens must have 3 dimensions (N, C, T)"
549
+ assert (
550
+ additional_tokens.shape[1] == self.input_embed_dim
551
+ ), f"Additional tokens must have input dimension {self.input_embed_dim}"
552
+ assert additional_tokens.shape[0] == batch_size, "Batch size mismatch for additional tokens"
553
+
554
+ # Reshape to channel-last format for transformer processing
555
+ additional_tokens = additional_tokens.permute(0, 2, 1).contiguous() # (N, C, T) -> (N, T, C)
556
+
557
+ # Concatenate the additional tokens to the multi-view features
558
+ multi_view_features = torch.cat([multi_view_features, additional_tokens], dim=1)
559
+
560
+ # Project input features to the transformer dimension
561
+ multi_view_features = self.proj_embed(multi_view_features)
562
+
563
+ # Create patch positions for each view if custom positional encoding is used
564
+ if self.custom_positional_encoding is not None:
565
+ multi_view_positions = [
566
+ self.position_getter(batch_size, height, width, multi_view_features.device)
567
+ ] * num_of_views # List of length V, where each tensor is (N, H * W, C)
568
+ multi_view_positions = torch.cat(multi_view_positions, dim=1) # (N, V * H * W, C)
569
+ else:
570
+ multi_view_positions = [None] * num_of_views
571
+
572
+ # Add None positions for additional tokens if they exist
573
+ if model_input.additional_input_tokens is not None:
574
+
575
+ additional_tokens_positions = [None] * model_input.additional_input_tokens.shape[1]
576
+ multi_view_positions = multi_view_positions + additional_tokens_positions
577
+
578
+ # Add positional encoding for reference view (idx 0)
579
+ ref_view_pe = self.view_pos_table[0].clone().detach()
580
+ ref_view_pe = ref_view_pe.reshape((1, 1, self.dim))
581
+ ref_view_pe = ref_view_pe.repeat(batch_size, num_of_tokens_per_view, 1)
582
+ ref_view_features = multi_view_features[:, :num_of_tokens_per_view, :]
583
+ ref_view_features = ref_view_features + ref_view_pe
584
+
585
+ if self.use_pe_for_non_reference_views:
586
+ # Add positional encoding for non-reference views (sequential indices starting from idx 1 or random indices which are uniformly sampled)
587
+ if self.use_rand_idx_pe_for_non_reference_views:
588
+ non_ref_view_pe_indices = torch.randint(low=1, high=self.max_num_views_for_pe, size=(num_of_views - 1,))
589
+ else:
590
+ non_ref_view_pe_indices = torch.arange(1, num_of_views)
591
+ non_ref_view_pe = self.view_pos_table[non_ref_view_pe_indices].clone().detach()
592
+ non_ref_view_pe = non_ref_view_pe.reshape((1, num_of_views - 1, self.dim))
593
+ non_ref_view_pe = non_ref_view_pe.repeat_interleave(num_of_tokens_per_view, dim=1)
594
+ non_ref_view_pe = non_ref_view_pe.repeat(batch_size, 1, 1)
595
+ non_ref_view_features = multi_view_features[
596
+ :, num_of_tokens_per_view : num_of_views * num_of_tokens_per_view, :
597
+ ]
598
+ non_ref_view_features = non_ref_view_features + non_ref_view_pe
599
+ else:
600
+ non_ref_view_features = multi_view_features[
601
+ :, num_of_tokens_per_view : num_of_views * num_of_tokens_per_view, :
602
+ ]
603
+
604
+ # Concatenate the reference and non-reference view features
605
+ # Handle additional tokens (no view-based positional encoding for them)
606
+ if model_input.additional_input_tokens is not None:
607
+
608
+ additional_features = multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
609
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features, additional_features], dim=1)
610
+ else:
611
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features], dim=1)
612
+
613
+ # Loop over the depth of the transformer
614
+ for depth_idx in range(self.depth):
615
+ if depth_idx % 2 == 0:
616
+ # Apply the self-attention block and update the multi-view features
617
+ # Global attention across all views
618
+ multi_view_features = self.self_attention_blocks[depth_idx](multi_view_features, multi_view_positions)
619
+ else:
620
+ # Handle additional tokens separately for frame-level attention
621
+ additional_features = None
622
+ additional_positions = None
623
+ if model_input.additional_input_tokens is not None:
624
+
625
+ # Extract additional token features
626
+ additional_features = multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
627
+ # Keep only view features for frame-level attention
628
+ multi_view_features = multi_view_features[:, : num_of_views * num_of_tokens_per_view, :]
629
+
630
+ # Handle positions for additional tokens if custom positional encoding is used
631
+ if self.custom_positional_encoding is not None:
632
+ additional_positions = multi_view_positions[:, num_of_views * num_of_tokens_per_view :, :]
633
+ multi_view_positions = multi_view_positions[:, : num_of_views * num_of_tokens_per_view, :]
634
+
635
+ # Reshape the multi-view features from (N, V * H * W, C) to (N * V, H * W, C)
636
+ multi_view_features = multi_view_features.reshape(
637
+ batch_size * num_of_views, num_of_tokens_per_view, self.dim
638
+ ).contiguous() # (N * V, H * W, C)
639
+ if multi_view_positions[0] is not None:
640
+ multi_view_positions = multi_view_positions.reshape(
641
+ batch_size * num_of_views, num_of_tokens_per_view, 2
642
+ ).contiguous() # (N * V, H * W, C)
643
+
644
+ # Apply the self-attention block and update the multi-view features
645
+ # Frame-level attention within each view
646
+ multi_view_features = self.self_attention_blocks[depth_idx](multi_view_features, multi_view_positions)
647
+
648
+ # Reshape the multi-view features from (N * V, H * W, C) back to (N, V * H * W, C)
649
+ multi_view_features = multi_view_features.reshape(
650
+ batch_size, num_of_views * num_of_tokens_per_view, self.dim
651
+ ).contiguous() # (N, V * H * W, C)
652
+ if multi_view_positions[0] is not None:
653
+ multi_view_positions = multi_view_positions.reshape(
654
+ batch_size, num_of_views * num_of_tokens_per_view, 2
655
+ ).contiguous() # (N, V * H * W, C)
656
+
657
+ # Reattach additional tokens if they exist
658
+ if additional_features is not None:
659
+ multi_view_features = torch.cat([multi_view_features, additional_features], dim=1)
660
+ # Reattach positions for additional tokens if they exist
661
+ if additional_positions is not None:
662
+ multi_view_positions = torch.cat([multi_view_positions, additional_positions], dim=1)
663
+ if depth_idx in take_indices:
664
+ # Normalize the intermediate features with final norm layer if enabled
665
+ intermediate_multi_view_features.append(
666
+ self.norm(multi_view_features) if self.norm_intermediate else multi_view_features
667
+ )
668
+
669
+ # Reshape the intermediate features and convert to MultiViewTransformerOutput class
670
+ for idx in range(len(intermediate_multi_view_features)):
671
+ # Get the current intermediate features
672
+ current_features = intermediate_multi_view_features[idx]
673
+
674
+ # Extract additional token features if provided
675
+ additional_token_features = None
676
+ if model_input.additional_input_tokens is not None:
677
+
678
+ additional_token_features = current_features[:, num_of_views * num_of_tokens_per_view :, :]
679
+ additional_token_features = additional_token_features.permute(0, 2, 1).contiguous() # (N, C, T)
680
+ # Only keep the view features for reshaping
681
+ current_features = current_features[:, : num_of_views * num_of_tokens_per_view, :]
682
+
683
+ # Reshape the intermediate multi-view features (N, V * H * W, C) back to (N, V, C, H, W)
684
+ current_features = current_features.reshape(
685
+ batch_size, num_of_views, height, width, self.dim
686
+ ) # (N, V, H, W, C)
687
+ current_features = current_features.permute(0, 1, 4, 2, 3).contiguous() # (N, V, C, H, W)
688
+
689
+ # Split the intermediate multi-view features into separate views
690
+ current_features = current_features.split(1, dim=1)
691
+ current_features = [
692
+ intermediate_view_features.squeeze(dim=1) for intermediate_view_features in current_features
693
+ ]
694
+
695
+ intermediate_multi_view_features[idx] = MultiViewTransformerOutput(
696
+ features=current_features, additional_token_features=additional_token_features
697
+ )
698
+
699
+ # Return only the intermediate features if enabled
700
+ if self.intermediates_only:
701
+ return intermediate_multi_view_features
702
+
703
+ # Normalize the output features
704
+ output_multi_view_features = self.norm(multi_view_features)
705
+
706
+ # Extract view features (excluding additional tokens)
707
+ additional_token_features = None
708
+ if model_input.additional_input_tokens is not None:
709
+
710
+ additional_token_features = output_multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
711
+ additional_token_features = additional_token_features.permute(0, 2, 1).contiguous() # (N, C, T)
712
+ view_features = output_multi_view_features[:, : num_of_views * num_of_tokens_per_view, :]
713
+ else:
714
+ view_features = output_multi_view_features
715
+
716
+ # Reshape the output multi-view features (N, V * H * W, C) back to (N, V, C, H, W)
717
+ view_features = view_features.reshape(batch_size, num_of_views, height, width, self.dim) # (N, V, H, W, C)
718
+ view_features = view_features.permute(0, 1, 4, 2, 3).contiguous() # (N, V, C, H, W)
719
+
720
+ # Split the output multi-view features into separate views
721
+ view_features = view_features.split(1, dim=1)
722
+ view_features = [output_view_features.squeeze(dim=1) for output_view_features in view_features]
723
+
724
+ output_multi_view_features = MultiViewTransformerOutput(
725
+ features=view_features, additional_token_features=additional_token_features
726
+ )
727
+
728
+ return output_multi_view_features, intermediate_multi_view_features
729
+
730
+
731
+ def dummy_positional_encoding(x, xpos):
732
+ "Dummy function for positional encoding of tokens"
733
+ x = x
734
+ xpos = xpos
735
+ return x
736
+
737
+
738
+ def test_reshape_for_frame_attention():
739
+ "Test the reshape function for frame-level attention in the Alternating Attention Transformer"
740
+ batch_size = 2
741
+ num_of_views = 3
742
+ height = width = 2
743
+ dim = 4
744
+ num_of_tokens_per_view = height * width
745
+
746
+ # Create tensor with recognizable pattern
747
+ x = torch.zeros(batch_size, num_of_views * num_of_tokens_per_view, dim)
748
+ for b in range(batch_size):
749
+ for v in range(num_of_views):
750
+ for h in range(height):
751
+ for w in range(width):
752
+ token_idx = v * num_of_tokens_per_view + h * width + w
753
+ x[b, token_idx] = torch.tensor([b, v, h, w])
754
+
755
+ # Apply reshape
756
+ reshaped = x.reshape(batch_size * num_of_views, num_of_tokens_per_view, dim).contiguous()
757
+
758
+ # Verify shape
759
+ assert reshaped.shape == (batch_size * num_of_views, num_of_tokens_per_view, dim)
760
+
761
+ # Verify content (check a few values)
762
+ for b in range(batch_size):
763
+ for v in range(num_of_views):
764
+ for h in range(height):
765
+ for w in range(width):
766
+ batch_view_idx = b * num_of_views + v
767
+ token_idx = h * width + w
768
+ expected = torch.tensor([b, v, h, w])
769
+ assert torch.all(reshaped[batch_view_idx, token_idx] == expected)
770
+
771
+ # Verify reshape back works
772
+ back_to_original = reshaped.reshape(batch_size, num_of_views * num_of_tokens_per_view, dim)
773
+ assert torch.all(x == back_to_original)
774
+
775
+ print("Reshape test passed!")
776
+
777
+
778
+ if __name__ == "__main__":
779
+ # Unit test the reshape logic used for frame-level attention
780
+ test_reshape_for_frame_attention()
781
+
782
+ # Init multi-view alternating-attention transformer with no custom positional encoding and run a forward pass
783
+ for num_views in [2, 3, 4]:
784
+ print(f"Testing MultiViewAlternatingAttentionTransformer with {num_views} views ...")
785
+ # No positional encoding for non-reference views
786
+ model = MultiViewAlternatingAttentionTransformer(
787
+ name="MV-AAT",
788
+ input_embed_dim=1024,
789
+ )
790
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
791
+ model_input = MultiViewTransformerInput(features=model_input)
792
+ model_output = model(model_input)
793
+ assert len(model_output.features) == num_views
794
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
795
+ # Sequential idx based positional encoding
796
+ model = MultiViewAlternatingAttentionTransformer(
797
+ name="MV-AAT",
798
+ input_embed_dim=1024,
799
+ use_pe_for_non_reference_views=True,
800
+ max_num_views_for_pe=1000,
801
+ use_rand_idx_pe_for_non_reference_views=False,
802
+ )
803
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
804
+ model_input = MultiViewTransformerInput(features=model_input)
805
+ model_output = model(model_input)
806
+ assert len(model_output.features) == num_views
807
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
808
+ # Random idx based positional encoding
809
+ model = MultiViewAlternatingAttentionTransformer(
810
+ name="MV-AAT",
811
+ input_embed_dim=1024,
812
+ use_pe_for_non_reference_views=True,
813
+ max_num_views_for_pe=1000,
814
+ use_rand_idx_pe_for_non_reference_views=True,
815
+ )
816
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
817
+ model_input = MultiViewTransformerInput(features=model_input)
818
+ model_output = model(model_input)
819
+ assert len(model_output.features) == num_views
820
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
821
+
822
+ # Init multi-view alternating-attention transformer with custom positional encoding and run a forward pass
823
+ for num_views in [2, 3, 4]:
824
+ print(
825
+ f"Testing MultiViewAlternatingAttentionTransformer with {num_views} views and custom positional encoding ..."
826
+ )
827
+ model = MultiViewAlternatingAttentionTransformer(
828
+ name="MV-AAT",
829
+ input_embed_dim=1024,
830
+ custom_positional_encoding=dummy_positional_encoding,
831
+ )
832
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
833
+ model_input = MultiViewTransformerInput(features=model_input)
834
+ model_output = model(model_input)
835
+ assert len(model_output.features) == num_views
836
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
837
+
838
+ print("All multi-view alternating-attention transformers initialized and tested successfully!")
839
+
840
+ # Intermediate Feature Returner Tests
841
+ print("Running Intermediate Feature Returner Tests ...")
842
+
843
+ # Run the intermediate feature returner with last-n index
844
+ model_intermediate_feature_returner = MultiViewAlternatingAttentionTransformerIFR(
845
+ name="MV-AAT-IFR",
846
+ input_embed_dim=1024,
847
+ indices=6, # Last 6 layers
848
+ )
849
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
850
+ model_input = MultiViewTransformerInput(features=model_input)
851
+ output = model_intermediate_feature_returner(model_input)
852
+ assert isinstance(output, tuple)
853
+ assert isinstance(output[0], MultiViewTransformerOutput)
854
+ assert len(output[1]) == 6
855
+ assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
856
+ assert len(output[1][0].features) == 2
857
+
858
+ # Run the intermediate feature returner with specific indices
859
+ model_intermediate_feature_returner = MultiViewAlternatingAttentionTransformerIFR(
860
+ name="MV-AAT-IFR",
861
+ input_embed_dim=1024,
862
+ indices=[0, 2, 4, 6], # Specific indices
863
+ )
864
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
865
+ model_input = MultiViewTransformerInput(features=model_input)
866
+ output = model_intermediate_feature_returner(model_input)
867
+ assert isinstance(output, tuple)
868
+ assert isinstance(output[0], MultiViewTransformerOutput)
869
+ assert len(output[1]) == 4
870
+ assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
871
+ assert len(output[1][0].features) == 2
872
+
873
+ # Test the normalization of intermediate features
874
+ model_intermediate_feature_returner = MultiViewAlternatingAttentionTransformerIFR(
875
+ name="MV-AAT-IFR",
876
+ input_embed_dim=1024,
877
+ indices=[-1], # Last layer
878
+ norm_intermediate=False, # Disable normalization
879
+ )
880
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
881
+ model_input = MultiViewTransformerInput(features=model_input)
882
+ output = model_intermediate_feature_returner(model_input)
883
+ for view_idx in range(2):
884
+ assert not torch.equal(
885
+ output[0].features[view_idx], output[1][-1].features[view_idx]
886
+ ), "Final features and intermediate features (last layer) must be different."
887
+
888
+ model_intermediate_feature_returner = MultiViewAlternatingAttentionTransformerIFR(
889
+ name="MV-AAT-IFR",
890
+ input_embed_dim=1024,
891
+ indices=[-1], # Last layer
892
+ norm_intermediate=True,
893
+ )
894
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
895
+ model_input = MultiViewTransformerInput(features=model_input)
896
+ output = model_intermediate_feature_returner(model_input)
897
+ for view_idx in range(2):
898
+ assert torch.equal(
899
+ output[0].features[view_idx], output[1][-1].features[view_idx]
900
+ ), "Final features and intermediate features (last layer) must be same."
901
+
902
+ print("All Intermediate Feature Returner Tests passed!")
903
+
904
+ # Test additional input tokens for MultiViewAlternatingAttentionTransformer
905
+ print("Testing MultiViewAlternatingAttentionTransformer with additional input tokens ...")
906
+ model = MultiViewAlternatingAttentionTransformer(
907
+ name="MV-AAT",
908
+ input_embed_dim=1024,
909
+ )
910
+ num_views = 2
911
+ num_additional_tokens = 5
912
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
913
+ additional_tokens = torch.rand(1, 1024, num_additional_tokens)
914
+ model_input = MultiViewTransformerInput(features=model_input, additional_input_tokens=additional_tokens)
915
+ model_output = model(model_input)
916
+ assert len(model_output.features) == num_views
917
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
918
+ assert model_output.additional_token_features is not None
919
+ assert model_output.additional_token_features.shape == (1, model.dim, num_additional_tokens)
920
+
921
+ # Test additional input tokens for MultiViewAlternatingAttentionTransformerIFR
922
+ print("Testing MultiViewAlternatingAttentionTransformerIFR with additional input tokens ...")
923
+ model_ifr = MultiViewAlternatingAttentionTransformerIFR(
924
+ name="MV-AAT-IFR",
925
+ input_embed_dim=1024,
926
+ indices=[0, 2, 4],
927
+ )
928
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
929
+ additional_tokens = torch.rand(1, 1024, num_additional_tokens)
930
+ model_input = MultiViewTransformerInput(features=model_input, additional_input_tokens=additional_tokens)
931
+ output = model_ifr(model_input)
932
+ assert isinstance(output, tuple)
933
+ assert isinstance(output[0], MultiViewTransformerOutput)
934
+ assert output[0].additional_token_features is not None
935
+ assert output[0].additional_token_features.shape == (1, model_ifr.dim, num_additional_tokens)
936
+ assert len(output[1]) == 3
937
+ assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
938
+ assert all(intermediate.additional_token_features is not None for intermediate in output[1])
939
+ assert all(
940
+ intermediate.additional_token_features.shape == (1, model_ifr.dim, num_additional_tokens)
941
+ for intermediate in output[1]
942
+ )
943
+
944
+ print("All tests using additional input tokens passed!")
UniCeption/uniception/models/info_sharing/base.py ADDED
@@ -0,0 +1,116 @@
1
+ """
2
+ Base Information Sharing Class for UniCeption
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from typing import List, Optional
7
+
8
+ import torch.nn as nn
9
+ from jaxtyping import Float
10
+ from torch import Tensor
11
+ from torch.utils.checkpoint import checkpoint
12
+
13
+
14
+ @dataclass
15
+ class InfoSharingInput:
16
+ pass
17
+
18
+
19
+ @dataclass
20
+ class InfoSharingOutput:
21
+ pass
22
+
23
+
24
+ class UniCeptionInfoSharingBase(nn.Module):
25
+ "Information Sharing Base Class for UniCeption"
26
+
27
+ def __init__(
28
+ self,
29
+ name: str,
30
+ size: Optional[str] = None,
31
+ *args,
32
+ **kwargs,
33
+ ):
34
+ """
35
+ Base class for all information sharing models in UniCeption.
36
+ """
37
+ super().__init__(*args, **kwargs)
38
+
39
+ self.name: str = name
40
+ self.size: Optional[str] = size
41
+
42
+ def forward(
43
+ self,
44
+ model_input: InfoSharingInput,
45
+ ) -> InfoSharingOutput:
46
+ """
47
+ Forward interface for the UniCeption information sharing models.
48
+
49
+ Args:
50
+ model_input (InfoSharingInput): Input to the model.
51
+ This also includes any other fields required by the specific implementation of the model.
52
+
53
+ Returns:
54
+ InfoSharingOutput: Output of the model.
55
+ """
56
+
57
+ raise NotImplementedError
58
+
59
+ def wrap_module_with_gradient_checkpointing(self, module: nn.Module):
60
+ """
61
+ Wrapper for Gradient Checkpointing
62
+ """
63
+
64
+ class _CheckpointingWrapper(module.__class__):
65
+ _restore_cls = module.__class__
66
+
67
+ def forward(self, *args, **kwargs):
68
+ return checkpoint(super().forward, *args, use_reentrant=False, **kwargs)
69
+
70
+ module.__class__ = _CheckpointingWrapper
71
+ return module
72
+
73
+
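A minimal sketch of what the wrapper above does, using a toy block (names below are illustrative only): the module's class is swapped for a subclass whose forward routes through torch.utils.checkpoint, so intermediate activations are recomputed during the backward pass instead of being stored.

```python
import torch
import torch.nn as nn

from uniception.models.info_sharing.base import UniCeptionInfoSharingBase

base = UniCeptionInfoSharingBase(name="checkpoint-demo")
block = nn.Sequential(nn.Linear(8, 16), nn.GELU(), nn.Linear(16, 8))
block = base.wrap_module_with_gradient_checkpointing(block)

x = torch.randn(4, 8, requires_grad=True)
loss = block(x).sum()
loss.backward()  # gradients still flow; activations are recomputed in backward
print(type(block).__name__, x.grad.shape)  # _CheckpointingWrapper torch.Size([4, 8])
```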
74
+ @dataclass
75
+ class MultiViewTransformerInput(InfoSharingInput):
76
+ """
77
+ Input class for Multi-View Transformer.
78
+ """
79
+
80
+ features: List[Float[Tensor, "batch input_embed_dim feat_height feat_width"]]
81
+ additional_input_tokens: Optional[Float[Tensor, "batch input_embed_dim num_additional_tokens"]] = None
82
+
83
+
84
+ @dataclass
85
+ class MultiViewTransformerOutput(InfoSharingOutput):
86
+ """
87
+ Output class for Multi-View Transformer.
88
+ """
89
+
90
+ features: List[Float[Tensor, "batch transformer_embed_dim feat_height feat_width"]]
91
+ additional_token_features: Optional[Float[Tensor, "batch transformer_embed_dim num_additional_tokens"]] = None
92
+
93
+
94
+ @dataclass
95
+ class MultiSetTransformerInput(InfoSharingInput):
96
+ """
97
+ Input class for Multi-Set Transformer.
98
+ """
99
+
100
+ features: List[Float[Tensor, "batch input_embed_dim num_tokens"]]
101
+ additional_input_tokens: Optional[Float[Tensor, "batch input_embed_dim num_additional_tokens"]] = None
102
+
103
+
104
+ @dataclass
105
+ class MultiSetTransformerOutput(InfoSharingOutput):
106
+ """
107
+ Output class for Multi-Set Transformer.
108
+ """
109
+
110
+ features: List[Float[Tensor, "batch transformer_embed_dim num_tokens"]]
111
+ additional_token_features: Optional[Float[Tensor, "batch transformer_embed_dim num_additional_tokens"]] = None
112
+
113
+
114
+ if __name__ == "__main__":
115
+ dummy_model = UniCeptionInfoSharingBase(name="dummy")
116
+ print("Dummy Base InfoSharing model created successfully!")
UniCeption/uniception/models/info_sharing/cross_attention_transformer.py ADDED
@@ -0,0 +1,582 @@
1
+ """
2
+ UniCeption Cross-Attention Transformer for Information Sharing
3
+ """
4
+
5
+ from copy import deepcopy
6
+ from functools import partial
7
+ from typing import Callable, List, Optional, Tuple, Type, Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from uniception.models.info_sharing.base import (
13
+ MultiViewTransformerInput,
14
+ MultiViewTransformerOutput,
15
+ UniCeptionInfoSharingBase,
16
+ )
17
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner, feature_take_indices
18
+ from uniception.models.utils.positional_encoding import PositionGetter
19
+ from uniception.models.utils.transformer_blocks import CrossAttentionBlock, Mlp
20
+
21
+
22
+ class MultiViewCrossAttentionTransformer(UniCeptionInfoSharingBase):
23
+ "UniCeption Multi-View Cross-Attention Transformer for information sharing across image features from different views."
24
+
25
+ def __init__(
26
+ self,
27
+ name: str,
28
+ input_embed_dim: int,
29
+ num_views: int,
30
+ size: Optional[str] = None,
31
+ depth: int = 12,
32
+ dim: int = 768,
33
+ num_heads: int = 12,
34
+ mlp_ratio: float = 4.0,
35
+ qkv_bias: bool = True,
36
+ qk_norm: bool = False,
37
+ proj_drop: float = 0.0,
38
+ attn_drop: float = 0.0,
39
+ init_values: Optional[float] = None,
40
+ drop_path: float = 0.0,
41
+ act_layer: Type[nn.Module] = nn.GELU,
42
+ norm_layer: Union[Type[nn.Module], Callable[..., nn.Module]] = partial(nn.LayerNorm, eps=1e-6),
43
+ mlp_layer: Type[nn.Module] = Mlp,
44
+ custom_positional_encoding: Optional[Callable] = None,
45
+ norm_cross_tokens: bool = True,
46
+ pretrained_checkpoint_path: Optional[str] = None,
47
+ gradient_checkpointing: bool = False,
48
+ *args,
49
+ **kwargs,
50
+ ):
51
+ """
52
+ Initialize the Multi-View Cross-Attention Transformer for information sharing across image features from different views.
53
+ Creates a cross-attention transformer with multiple branches for each view.
54
+
55
+ Args:
56
+ input_embed_dim (int): Dimension of input embeddings.
57
+ num_views (int): Number of views (input feature sets).
58
+ size (str): String indicating the interpretable size of the transformer (e.g., base, large, ...). (default: None)
59
+ depth (int): Number of transformer layers. (default: 12, base size)
60
+ dim (int): Dimension of the transformer. (default: 768, base size)
61
+ num_heads (int): Number of attention heads. (default: 12, base size)
62
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
63
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
64
+ qk_norm (bool): Whether to normalize q and k (default: False)
65
+ proj_drop (float): Dropout rate for output (default: 0.)
66
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
67
+ init_values (float): Initial value for LayerScale gamma (default: None)
68
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
69
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
70
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
71
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
72
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
73
+ norm_cross_tokens (bool): Whether to normalize cross tokens (default: True)
74
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
75
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
76
+ """
77
+ # Initialize the base class
78
+ super().__init__(name=name, size=size, *args, **kwargs)
79
+
80
+ # Initialize the specific attributes of the transformer
81
+ self.input_embed_dim = input_embed_dim
82
+ self.num_views = num_views
83
+ self.depth = depth
84
+ self.dim = dim
85
+ self.num_heads = num_heads
86
+ self.mlp_ratio = mlp_ratio
87
+ self.qkv_bias = qkv_bias
88
+ self.qk_norm = qk_norm
89
+ self.proj_drop = proj_drop
90
+ self.attn_drop = attn_drop
91
+ self.init_values = init_values
92
+ self.drop_path = drop_path
93
+ self.act_layer = act_layer
94
+ self.norm_layer = norm_layer
95
+ self.mlp_layer = mlp_layer
96
+ self.custom_positional_encoding = custom_positional_encoding
97
+ self.norm_cross_tokens = norm_cross_tokens
98
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
99
+ self.gradient_checkpointing = gradient_checkpointing
100
+
101
+ # Initialize the projection layer for input embeddings
102
+ if self.input_embed_dim != self.dim:
103
+ self.proj_embed = nn.Linear(self.input_embed_dim, self.dim, bias=True)
104
+ else:
105
+ self.proj_embed = nn.Identity()
106
+
107
+ # Initialize the cross-attention blocks for a single view
108
+ cross_attention_blocks = nn.ModuleList(
109
+ [
110
+ CrossAttentionBlock(
111
+ dim=self.dim,
112
+ num_heads=self.num_heads,
113
+ mlp_ratio=self.mlp_ratio,
114
+ qkv_bias=self.qkv_bias,
115
+ qk_norm=self.qk_norm,
116
+ proj_drop=self.proj_drop,
117
+ attn_drop=self.attn_drop,
118
+ init_values=self.init_values,
119
+ drop_path=self.drop_path,
120
+ act_layer=self.act_layer,
121
+ norm_layer=self.norm_layer,
122
+ mlp_layer=self.mlp_layer,
123
+ custom_positional_encoding=self.custom_positional_encoding,
124
+ norm_cross_tokens=self.norm_cross_tokens,
125
+ )
126
+ for _ in range(self.depth)
127
+ ]
128
+ )
129
+
130
+ # Copy the cross-attention blocks for all other views
131
+ self.multi_view_branches = nn.ModuleList([cross_attention_blocks])
132
+ for _ in range(1, self.num_views):
133
+ self.multi_view_branches.append(deepcopy(cross_attention_blocks))
134
+
135
+ # Initialize the final normalization layer
136
+ self.norm = self.norm_layer(self.dim)
137
+
138
+ # Initialize the position getter for patch positions if required
139
+ if self.custom_positional_encoding is not None:
140
+ self.position_getter = PositionGetter()
141
+
142
+ # Initialize random weights
143
+ self.initialize_weights()
144
+
145
+ # Apply gradient checkpointing if enabled
146
+ if self.gradient_checkpointing:
147
+ # Wrap every block in every view branch (the blocks are registered under multi_view_branches)
+ for branch in self.multi_view_branches:
+ for i, block in enumerate(branch):
+ branch[i] = self.wrap_module_with_gradient_checkpointing(block)
149
+
150
+ # Load pretrained weights if provided
151
+ if self.pretrained_checkpoint_path is not None:
152
+ print(
153
+ f"Loading pretrained multi-view cross-attention transformer weights from {self.pretrained_checkpoint_path} ..."
154
+ )
155
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
156
+ print(self.load_state_dict(ckpt["model"]))
157
+
158
+ def initialize_weights(self):
159
+ "Initialize weights of the transformer."
160
+ # Linears and layer norms
161
+ self.apply(self._init_weights)
162
+
163
+ def _init_weights(self, m):
164
+ "Initialize the transformer linear and layer norm weights."
165
+ if isinstance(m, nn.Linear):
166
+ # We use xavier_uniform following official JAX ViT:
167
+ torch.nn.init.xavier_uniform_(m.weight)
168
+ if isinstance(m, nn.Linear) and m.bias is not None:
169
+ nn.init.constant_(m.bias, 0)
170
+ elif isinstance(m, nn.LayerNorm):
171
+ nn.init.constant_(m.bias, 0)
172
+ nn.init.constant_(m.weight, 1.0)
173
+
174
+ def forward(
175
+ self,
176
+ model_input: MultiViewTransformerInput,
177
+ ) -> MultiViewTransformerOutput:
178
+ """
179
+ Forward interface for the Multi-View Cross-Attention Transformer.
180
+
181
+ Args:
182
+ model_input (MultiViewTransformerInput): Input to the model.
183
+ Expects the features to be a list of size (batch, input_embed_dim, height, width),
184
+ where each entry corresponds to a different view.
185
+
186
+ Returns:
187
+ MultiViewTransformerOutput: Output of the model post information sharing.
188
+ """
189
+ # Check that the number of views matches the input and the features are of expected shape
190
+ assert (
191
+ len(model_input.features) == self.num_views
192
+ ), f"Expected {self.num_views} views, got {len(model_input.features)}"
193
+ assert all(
194
+ view_features.shape[1] == self.input_embed_dim for view_features in model_input.features
195
+ ), f"All views must have input dimension {self.input_embed_dim}"
196
+ assert all(
197
+ view_features.ndim == 4 for view_features in model_input.features
198
+ ), "All views must have 4 dimensions (N, C, H, W)"
199
+
200
+ # Initialize the multi-view features from the model input
201
+ multi_view_features = model_input.features
202
+
203
+ # Resize the multi-view features from NCHW to NLC
204
+ batch_size, _, height, width = multi_view_features[0].shape
205
+ multi_view_features = [
206
+ view_features.permute(0, 2, 3, 1).reshape(batch_size, height * width, self.input_embed_dim).contiguous()
207
+ for view_features in multi_view_features
208
+ ]
209
+
210
+ # Create patch positions for each view if custom positional encoding is used
211
+ if self.custom_positional_encoding is not None:
212
+ multi_view_positions = [
213
+ self.position_getter(batch_size, height, width, view_features.device)
214
+ for view_features in multi_view_features
215
+ ]
216
+ else:
217
+ multi_view_positions = [None] * self.num_views
218
+
219
+ # Project input features to the transformer dimension
220
+ multi_view_features = [self.proj_embed(view_features) for view_features in multi_view_features]
221
+
222
+ # Pass through each view's cross-attention blocks
223
+ # Loop over the depth of the transformer
224
+ for depth_idx in range(self.depth):
225
+ updated_multi_view_features = []
226
+ # Loop over each view
227
+ for view_idx, view_features in enumerate(multi_view_features):
228
+ # Get all the other views
229
+ other_views_features = [multi_view_features[i] for i in range(self.num_views) if i != view_idx]
230
+ # Concatenate all the tokens from the other views
231
+ other_views_features = torch.cat(other_views_features, dim=1)
232
+ # Get the positions for the current view
233
+ view_positions = multi_view_positions[view_idx]
234
+ # Get the positions for all other views
235
+ other_views_positions = (
236
+ torch.cat([multi_view_positions[i] for i in range(self.num_views) if i != view_idx], dim=1)
237
+ if view_positions is not None
238
+ else None
239
+ )
240
+ # Apply the cross-attention block and update the multi-view features
241
+ updated_view_features = self.multi_view_branches[view_idx][depth_idx](
242
+ view_features, other_views_features, view_positions, other_views_positions
243
+ )
244
+ # Keep track of the updated view features
245
+ updated_multi_view_features.append(updated_view_features)
246
+ # Update the multi-view features for the next depth
247
+ multi_view_features = updated_multi_view_features
248
+
249
+ # Normalize the output features
250
+ output_multi_view_features = [self.norm(view_features) for view_features in multi_view_features]
251
+
252
+ # Resize the output multi-view features back to NCHW
253
+ output_multi_view_features = [
254
+ view_features.reshape(batch_size, height, width, self.dim).permute(0, 3, 1, 2).contiguous()
255
+ for view_features in output_multi_view_features
256
+ ]
257
+
258
+ return MultiViewTransformerOutput(features=output_multi_view_features)
259
+
260
+
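The NCHW-to-NLC round trip used inside forward is worth spelling out on its own; a standalone sketch with illustrative sizes:

```python
import torch

B, C, H, W = 2, 768, 14, 14
nchw = torch.rand(B, C, H, W)

# NCHW -> NLC: one token per spatial location, channels last.
nlc = nchw.permute(0, 2, 3, 1).reshape(B, H * W, C).contiguous()

# ... the cross-attention blocks operate on these (B, L, C) token sequences ...

# NLC -> NCHW: restore the spatial feature map for downstream prediction heads.
restored = nlc.reshape(B, H, W, C).permute(0, 3, 1, 2).contiguous()
assert torch.equal(nchw, restored)
```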
261
+ class MultiViewCrossAttentionTransformerIFR(MultiViewCrossAttentionTransformer, IntermediateFeatureReturner):
262
+ "Intermediate Feature Returner for UniCeption Multi-View Cross-Attention Transformer"
263
+
264
+ def __init__(
265
+ self,
266
+ name: str,
267
+ input_embed_dim: int,
268
+ num_views: int,
269
+ size: Optional[str] = None,
270
+ depth: int = 12,
271
+ dim: int = 768,
272
+ num_heads: int = 12,
273
+ mlp_ratio: float = 4.0,
274
+ qkv_bias: bool = True,
275
+ qk_norm: bool = False,
276
+ proj_drop: float = 0.0,
277
+ attn_drop: float = 0.0,
278
+ init_values: Optional[float] = None,
279
+ drop_path: float = 0.0,
280
+ act_layer: nn.Module = nn.GELU,
281
+ norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),
282
+ mlp_layer: nn.Module = Mlp,
283
+ custom_positional_encoding: Callable = None,
284
+ norm_cross_tokens: bool = True,
285
+ pretrained_checkpoint_path: str = None,
286
+ indices: Optional[Union[int, List[int]]] = None,
287
+ norm_intermediate: bool = True,
288
+ intermediates_only: bool = False,
289
+ gradient_checkpointing: bool = False,
290
+ *args,
291
+ **kwargs,
292
+ ):
293
+ """
294
+ Initialize the Multi-View Cross-Attention Transformer for information sharing across image features from different views.
295
+ Creates a cross-attention transformer with multiple branches for each view.
296
+ Extends the base class to return intermediate features.
297
+
298
+ Args:
299
+ input_embed_dim (int): Dimension of input embeddings.
300
+ num_views (int): Number of views (input feature sets).
301
+ size (str): String to indicate interpretable size of the transformer (for e.g., base, large, ...). (default: None)
302
+ depth (int): Number of transformer layers. (default: 12, base size)
303
+ dim (int): Dimension of the transformer. (default: 768, base size)
304
+ num_heads (int): Number of attention heads. (default: 12, base size)
305
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
306
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
307
+ qk_norm (bool): Whether to normalize q and k (default: False)
308
+ proj_drop (float): Dropout rate for output (default: 0.)
309
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
310
+ init_values (float): Initial value for LayerScale gamma (default: None)
311
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
312
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
313
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
314
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
315
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
316
+ norm_cross_tokens (bool): Whether to normalize cross tokens (default: True)
317
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
318
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. (default: None) Options:
319
+ - None: Return all intermediate layers.
320
+ - int: Return the last n layers.
321
+ - List[int]: Return the intermediate layers at the specified indices.
322
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. (default: True)
323
+ intermediates_only (bool, optional): Whether to return only the intermediate features. (default: False)
324
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
325
+ """
326
+ # Init the base classes
327
+ MultiViewCrossAttentionTransformer.__init__(
328
+ self,
329
+ name=name,
330
+ input_embed_dim=input_embed_dim,
331
+ num_views=num_views,
332
+ size=size,
333
+ depth=depth,
334
+ dim=dim,
335
+ num_heads=num_heads,
336
+ mlp_ratio=mlp_ratio,
337
+ qkv_bias=qkv_bias,
338
+ qk_norm=qk_norm,
339
+ proj_drop=proj_drop,
340
+ attn_drop=attn_drop,
341
+ init_values=init_values,
342
+ drop_path=drop_path,
343
+ act_layer=act_layer,
344
+ norm_layer=norm_layer,
345
+ mlp_layer=mlp_layer,
346
+ custom_positional_encoding=custom_positional_encoding,
347
+ norm_cross_tokens=norm_cross_tokens,
348
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
349
+ gradient_checkpointing=gradient_checkpointing,
350
+ *args,
351
+ **kwargs,
352
+ )
353
+ IntermediateFeatureReturner.__init__(
354
+ self,
355
+ indices=indices,
356
+ norm_intermediate=norm_intermediate,
357
+ intermediates_only=intermediates_only,
358
+ )
359
+
360
+ def forward(
361
+ self,
362
+ model_input: MultiViewTransformerInput,
363
+ ) -> Union[
364
+ List[MultiViewTransformerOutput],
365
+ Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]],
366
+ ]:
367
+ """
368
+ Forward interface for the Multi-View Cross-Attention Transformer with Intermediate Feature Return.
369
+
370
+ Args:
371
+ model_input (MultiViewTransformerInput): Input to the model.
372
+ Expects the features to be a list of size (batch, input_embed_dim, height, width),
373
+ where each entry corresponds to a different view.
374
+
375
+ Returns:
376
+ Union[List[MultiViewTransformerOutput], Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]]]:
377
+ Output of the model post information sharing.
378
+ If intermediates_only is True, returns a list of intermediate outputs.
379
+ If intermediates_only is False, returns a tuple of final output and a list of intermediate outputs.
380
+ """
381
+ # Check that the number of views matches the input and the features are of expected shape
382
+ assert (
383
+ len(model_input.features) == self.num_views
384
+ ), f"Expected {self.num_views} views, got {len(model_input.features)}"
385
+ assert all(
386
+ view_features.shape[1] == self.input_embed_dim for view_features in model_input.features
387
+ ), f"All views must have input dimension {self.input_embed_dim}"
388
+ assert all(
389
+ view_features.ndim == 4 for view_features in model_input.features
390
+ ), "All views must have 4 dimensions (N, C, H, W)"
391
+
392
+ # Get the indices of the intermediate features to return
393
+ intermediate_multi_view_features = []
394
+ take_indices, _ = feature_take_indices(self.depth, self.indices)
395
+
396
+ # Initialize the multi-view features from the model input
397
+ multi_view_features = model_input.features
398
+
399
+ # Resize the multi-view features from NCHW to NLC
400
+ batch_size, _, height, width = multi_view_features[0].shape
401
+ multi_view_features = [
402
+ view_features.permute(0, 2, 3, 1).reshape(batch_size, height * width, self.input_embed_dim).contiguous()
403
+ for view_features in multi_view_features
404
+ ]
405
+
406
+ # Create patch positions for each view if custom positional encoding is used
407
+ if self.custom_positional_encoding is not None:
408
+ multi_view_positions = [
409
+ self.position_getter(batch_size, height, width, view_features.device)
410
+ for view_features in multi_view_features
411
+ ]
412
+ else:
413
+ multi_view_positions = [None] * self.num_views
414
+
415
+ # Project input features to the transformer dimension
416
+ multi_view_features = [self.proj_embed(view_features) for view_features in multi_view_features]
417
+
418
+ # Pass through each view's cross-attention blocks
419
+ # Loop over the depth of the transformer
420
+ for depth_idx in range(self.depth):
421
+ updated_multi_view_features = []
422
+ # Loop over each view
423
+ for view_idx, view_features in enumerate(multi_view_features):
424
+ # Get all the other views
425
+ other_views_features = [multi_view_features[i] for i in range(self.num_views) if i != view_idx]
426
+ # Concatenate all the tokens from the other views
427
+ other_views_features = torch.cat(other_views_features, dim=1)
428
+ # Get the positions for the current view
429
+ view_positions = multi_view_positions[view_idx]
430
+ # Get the positions for all other views
431
+ other_views_positions = (
432
+ torch.cat([multi_view_positions[i] for i in range(self.num_views) if i != view_idx], dim=1)
433
+ if view_positions is not None
434
+ else None
435
+ )
436
+ # Apply the cross-attention block and update the multi-view features
437
+ updated_view_features = self.multi_view_branches[view_idx][depth_idx](
438
+ view_features, other_views_features, view_positions, other_views_positions
439
+ )
440
+ # Keep track of the updated view features
441
+ updated_multi_view_features.append(updated_view_features)
442
+ # Update the multi-view features for the next depth
443
+ multi_view_features = updated_multi_view_features
444
+ # Append the intermediate features if required
445
+ if depth_idx in take_indices:
446
+ # Normalize the intermediate features with final norm layer if enabled
447
+ intermediate_multi_view_features.append(
448
+ [self.norm(view_features) for view_features in multi_view_features]
449
+ if self.norm_intermediate
450
+ else multi_view_features
451
+ )
452
+
453
+ # Reshape the intermediate features and convert to MultiViewTransformerOutput class
454
+ for idx in range(len(intermediate_multi_view_features)):
455
+ intermediate_multi_view_features[idx] = [
456
+ view_features.reshape(batch_size, height, width, self.dim).permute(0, 3, 1, 2).contiguous()
457
+ for view_features in intermediate_multi_view_features[idx]
458
+ ]
459
+ intermediate_multi_view_features[idx] = MultiViewTransformerOutput(
460
+ features=intermediate_multi_view_features[idx]
461
+ )
462
+
463
+ # Return only the intermediate features if enabled
464
+ if self.intermediates_only:
465
+ return intermediate_multi_view_features
466
+
467
+ # Normalize the output features
468
+ output_multi_view_features = [self.norm(view_features) for view_features in multi_view_features]
469
+
470
+ # Resize the output multi-view features back to NCHW
471
+ output_multi_view_features = [
472
+ view_features.reshape(batch_size, height, width, self.dim).permute(0, 3, 1, 2).contiguous()
473
+ for view_features in output_multi_view_features
474
+ ]
475
+
476
+ output_multi_view_features = MultiViewTransformerOutput(features=output_multi_view_features)
477
+
478
+ return output_multi_view_features, intermediate_multi_view_features
479
+
480
+
481
+ def dummy_positional_encoding(x, xpos):
482
+ "Dummy function for positional encoding of tokens"
483
+ x = x
484
+ xpos = xpos
485
+ return x
486
+
487
+
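dummy_positional_encoding above only documents the expected (tokens, positions) -> tokens signature. Below is a hedged sketch of a non-trivial custom_positional_encoding; it assumes positions are integer (y, x) grid coordinates of shape (batch, num_tokens, 2), as a CroCo-style PositionGetter typically yields, and an embedding width divisible by 4.

```python
import torch


def sincos_positional_encoding(x, xpos):
    """Add a fixed sinusoidal embedding of the (y, x) grid position to each token.

    Assumes x is (B, N, D) with D divisible by 4 and xpos is (B, N, 2) integer coordinates.
    """
    _, _, D = x.shape
    freqs = torch.exp(-torch.arange(D // 4, device=x.device, dtype=x.dtype) * (8.0 / D))
    y_term = xpos[..., 0:1].to(x.dtype) * freqs  # (B, N, D // 4)
    x_term = xpos[..., 1:2].to(x.dtype) * freqs
    pe = torch.cat([y_term.sin(), y_term.cos(), x_term.sin(), x_term.cos()], dim=-1)  # (B, N, D)
    return x + pe
```

Passed to the constructor as custom_positional_encoding=sincos_positional_encoding, it is handed to each cross-attention block together with the per-view patch positions created in forward.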
488
+ if __name__ == "__main__":
489
+ # Init multi-view cross-attention transformer with no custom positional encoding and run a forward pass
490
+ for num_views in [2, 3, 4]:
491
+ print(f"Testing MultiViewCrossAttentionTransformer with {num_views} views ...")
492
+ model = MultiViewCrossAttentionTransformer(name="MV-CAT", input_embed_dim=1024, num_views=num_views)
493
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
494
+ model_input = MultiViewTransformerInput(features=model_input)
495
+ model_output = model(model_input)
496
+ assert len(model_output.features) == num_views
497
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
498
+
499
+ # Init multi-view cross-attention transformer with custom positional encoding and run a forward pass
500
+ for num_views in [2, 3, 4]:
501
+ print(f"Testing MultiViewCrossAttentionTransformer with {num_views} views and custom positional encoding ...")
502
+ model = MultiViewCrossAttentionTransformer(
503
+ name="MV-CAT",
504
+ input_embed_dim=1024,
505
+ num_views=num_views,
506
+ custom_positional_encoding=dummy_positional_encoding,
507
+ )
508
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
509
+ model_input = MultiViewTransformerInput(features=model_input)
510
+ model_output = model(model_input)
511
+ assert len(model_output.features) == num_views
512
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
513
+
514
+ print("All multi-view cross-attention transformers initialized and tested successfully!")
515
+
516
+ # Intermediate Feature Returner Tests
517
+ print("Running Intermediate Feature Returner Tests ...")
518
+
519
+ # Run the intermediate feature returner with last-n index
520
+ model_intermediate_feature_returner = MultiViewCrossAttentionTransformerIFR(
521
+ name="MV-CAT-IFR",
522
+ input_embed_dim=1024,
523
+ num_views=2,
524
+ indices=6, # Last 6 layers
525
+ )
526
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
527
+ model_input = MultiViewTransformerInput(features=model_input)
528
+ output = model_intermediate_feature_returner(model_input)
529
+ assert isinstance(output, tuple)
530
+ assert isinstance(output[0], MultiViewTransformerOutput)
531
+ assert len(output[1]) == 6
532
+ assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
533
+ assert len(output[1][0].features) == 2
534
+
535
+ # Run the intermediate feature returner with specific indices
536
+ model_intermediate_feature_returner = MultiViewCrossAttentionTransformerIFR(
537
+ name="MV-CAT-IFR",
538
+ input_embed_dim=1024,
539
+ num_views=2,
540
+ indices=[0, 2, 4, 6], # Specific indices
541
+ )
542
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
543
+ model_input = MultiViewTransformerInput(features=model_input)
544
+ output = model_intermediate_feature_returner(model_input)
545
+ assert isinstance(output, tuple)
546
+ assert isinstance(output[0], MultiViewTransformerOutput)
547
+ assert len(output[1]) == 4
548
+ assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
549
+ assert len(output[1][0].features) == 2
550
+
551
+ # Test the normalization of intermediate features
552
+ model_intermediate_feature_returner = MultiViewCrossAttentionTransformerIFR(
553
+ name="MV-CAT-IFR",
554
+ input_embed_dim=1024,
555
+ num_views=2,
556
+ indices=[-1], # Last layer
557
+ norm_intermediate=False, # Disable normalization
558
+ )
559
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
560
+ model_input = MultiViewTransformerInput(features=model_input)
561
+ output = model_intermediate_feature_returner(model_input)
562
+ for view_idx in range(2):
563
+ assert not torch.equal(
564
+ output[0].features[view_idx], output[1][-1].features[view_idx]
565
+ ), "Final features and intermediate features (last layer) must be different."
566
+
567
+ model_intermediate_feature_returner = MultiViewCrossAttentionTransformerIFR(
568
+ name="MV-CAT-IFR",
569
+ input_embed_dim=1024,
570
+ num_views=2,
571
+ indices=[-1], # Last layer
572
+ norm_intermediate=True,
573
+ )
574
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
575
+ model_input = MultiViewTransformerInput(features=model_input)
576
+ output = model_intermediate_feature_returner(model_input)
577
+ for view_idx in range(2):
578
+ assert torch.equal(
579
+ output[0].features[view_idx], output[1][-1].features[view_idx]
580
+ ), "Final features and intermediate features (last layer) must be same."
581
+
582
+ print("All Intermediate Feature Returner Tests passed!")
UniCeption/uniception/models/info_sharing/diff_cross_attention_transformer.py ADDED
@@ -0,0 +1,588 @@
1
+ """
2
+ UniCeption Differential Cross-Attention Transformer for Information Sharing
3
+ """
4
+
5
+ from copy import deepcopy
6
+ from functools import partial
7
+ from typing import Callable, List, Optional, Tuple, Type, Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from uniception.models.info_sharing.base import UniCeptionInfoSharingBase
13
+ from uniception.models.info_sharing.cross_attention_transformer import (
14
+ MultiViewTransformerInput,
15
+ MultiViewTransformerOutput,
16
+ PositionGetter,
17
+ )
18
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner, feature_take_indices
19
+ from uniception.models.utils.transformer_blocks import DiffCrossAttentionBlock, Mlp
20
+
21
+
22
+ class DifferentialMultiViewCrossAttentionTransformer(UniCeptionInfoSharingBase):
23
+ "UniCeption Multi-View Cross-Attention Transformer for information sharing across image features from different views."
24
+
25
+ def __init__(
26
+ self,
27
+ name: str,
28
+ input_embed_dim: int,
29
+ num_views: int,
30
+ size: Optional[str] = None,
31
+ depth: int = 12,
32
+ dim: int = 768,
33
+ num_heads: int = 12,
34
+ mlp_ratio: float = 4.0,
35
+ qkv_bias: bool = True,
36
+ qk_norm: bool = False,
37
+ proj_drop: float = 0.0,
38
+ attn_drop: float = 0.0,
39
+ init_values: Optional[float] = None,
40
+ drop_path: float = 0.0,
41
+ act_layer: Type[nn.Module] = nn.GELU,
42
+ norm_layer: Union[Type[nn.Module], Callable[..., nn.Module]] = partial(nn.LayerNorm, eps=1e-6),
43
+ mlp_layer: Type[nn.Module] = Mlp,
44
+ custom_positional_encoding: Optional[Callable] = None,
45
+ norm_cross_tokens: bool = True,
46
+ pretrained_checkpoint_path: Optional[str] = None,
47
+ gradient_checkpointing: bool = False,
48
+ *args,
49
+ **kwargs,
50
+ ):
51
+ """
52
+ Initialize the Differential Multi-View Cross-Attention Transformer for information sharing across image features from different views.
53
+ Creates a cross-attention transformer with multiple branches for each view.
54
+
55
+ Args:
56
+ input_embed_dim (int): Dimension of input embeddings.
57
+ num_views (int): Number of views (input feature sets).
58
+ depth (int): Number of transformer layers. (default: 12, base size)
59
+ dim (int): Dimension of the transformer. (default: 768, base size)
60
+ num_heads (int): Number of attention heads. (default: 12, base size)
61
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
62
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
63
+ qk_norm (bool): Whether to normalize q and k (default: False)
64
+ proj_drop (float): Dropout rate for output (default: 0.)
65
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
66
+ init_values (float): Initial value for LayerScale gamma (default: None)
67
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
68
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
69
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
70
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
71
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
72
+ norm_cross_tokens (bool): Whether to normalize cross tokens (default: True)
73
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
74
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
75
+ """
76
+ # Initialize the base class
77
+ super().__init__(name=name, size=size, *args, **kwargs)
78
+
79
+ # Initialize the specific attributes of the transformer
80
+ self.input_embed_dim = input_embed_dim
81
+ self.num_views = num_views
82
+ self.depth = depth
83
+ self.dim = dim
84
+ self.num_heads = num_heads
85
+ self.mlp_ratio = mlp_ratio
86
+ self.qkv_bias = qkv_bias
87
+ self.qk_norm = qk_norm
88
+ self.proj_drop = proj_drop
89
+ self.attn_drop = attn_drop
90
+ self.init_values = init_values
91
+ self.drop_path = drop_path
92
+ self.act_layer = act_layer
93
+ self.norm_layer = norm_layer
94
+ self.mlp_layer = mlp_layer
95
+ self.custom_positional_encoding = custom_positional_encoding
96
+ self.norm_cross_tokens = norm_cross_tokens
97
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
98
+ self.gradient_checkpointing = gradient_checkpointing
99
+
100
+ # Initialize the projection layer for input embeddings
101
+ if self.input_embed_dim != self.dim:
102
+ self.proj_embed = nn.Linear(self.input_embed_dim, self.dim, bias=True)
103
+ else:
104
+ self.proj_embed = nn.Identity()
105
+
106
+ # Initialize the cross-attention blocks for a single view
107
+ assert num_heads % 2 == 0, "Number of heads must be divisible by 2 for differential cross-attention."
108
+ cross_attention_blocks = nn.ModuleList(
109
+ [
110
+ DiffCrossAttentionBlock(
111
+ depth=i,
112
+ dim=self.dim,
113
+ num_heads=self.num_heads // 2,
114
+ mlp_ratio=self.mlp_ratio,
115
+ qkv_bias=self.qkv_bias,
116
+ qk_norm=self.qk_norm,
117
+ proj_drop=self.proj_drop,
118
+ attn_drop=self.attn_drop,
119
+ init_values=self.init_values,
120
+ drop_path=self.drop_path,
121
+ act_layer=self.act_layer,
122
+ norm_layer=self.norm_layer,
123
+ mlp_layer=self.mlp_layer,
124
+ custom_positional_encoding=self.custom_positional_encoding,
125
+ norm_cross_tokens=self.norm_cross_tokens,
126
+ )
127
+ for i in range(self.depth)
128
+ ]
129
+ )
130
+
131
+ # Copy the cross-attention blocks for all other views
132
+ self.multi_view_branches = nn.ModuleList([cross_attention_blocks])
133
+ for _ in range(1, self.num_views):
134
+ self.multi_view_branches.append(deepcopy(cross_attention_blocks))
135
+
136
+ # Initialize the final normalization layer
137
+ self.norm = self.norm_layer(self.dim)
138
+
139
+ # Initialize the position getter for patch positions if required
140
+ if self.custom_positional_encoding is not None:
141
+ self.position_getter = PositionGetter()
142
+
143
+ # Initialize random weights
144
+ self.initialize_weights()
145
+
146
+ # Apply gradient checkpointing if enabled
147
+ if self.gradient_checkpointing:
148
+ for i, block in enumerate(self.cross_attention_blocks):
149
+ self.cross_attention_blocks[i] = self.wrap_module_with_gradient_checkpointing(block)
150
+
151
+ # Load pretrained weights if provided
152
+ if self.pretrained_checkpoint_path is not None:
153
+ print(
154
+ f"Loading pretrained multi-view cross-attention transformer weights from {self.pretrained_checkpoint_path} ..."
155
+ )
156
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
157
+ print(self.load_state_dict(ckpt["model"]))
158
+
159
+ def initialize_weights(self):
160
+ "Initialize weights of the transformer."
161
+ # Linears and layer norms
162
+ self.apply(self._init_weights)
163
+
164
+ def _init_weights(self, m):
165
+ "Initialize the transformer linear and layer norm weights."
166
+ if isinstance(m, nn.Linear):
167
+ # We use xavier_uniform following official JAX ViT:
168
+ torch.nn.init.xavier_uniform_(m.weight)
169
+ if isinstance(m, nn.Linear) and m.bias is not None:
170
+ nn.init.constant_(m.bias, 0)
171
+ elif isinstance(m, nn.LayerNorm):
172
+ nn.init.constant_(m.bias, 0)
173
+ nn.init.constant_(m.weight, 1.0)
174
+
175
+ def forward(
176
+ self,
177
+ model_input: MultiViewTransformerInput,
178
+ ) -> MultiViewTransformerOutput:
179
+ """
180
+ Forward interface for the Multi-View Cross-Attention Transformer.
181
+
182
+ Args:
183
+ model_input (MultiViewTransformerInput): Input to the model.
184
+ Expects the features to be a list of size (batch, input_embed_dim, height, width),
185
+ where each entry corresponds to a different view.
186
+
187
+ Returns:
188
+ MultiViewTransformerOutput: Output of the model post information sharing.
189
+ """
190
+ # Check that the number of views matches the input and the features are of expected shape
191
+ assert (
192
+ len(model_input.features) == self.num_views
193
+ ), f"Expected {self.num_views} views, got {len(model_input.features)}"
194
+ assert all(
195
+ view_features.shape[1] == self.input_embed_dim for view_features in model_input.features
196
+ ), f"All views must have input dimension {self.input_embed_dim}"
197
+ assert all(
198
+ view_features.ndim == 4 for view_features in model_input.features
199
+ ), "All views must have 4 dimensions (N, C, H, W)"
200
+
201
+ # Initialize the multi-view features from the model input
202
+ multi_view_features = model_input.features
203
+
204
+ # Resize the multi-view features from NCHW to NLC
205
+ batch_size, _, height, width = multi_view_features[0].shape
206
+ multi_view_features = [
207
+ view_features.permute(0, 2, 3, 1).reshape(batch_size, height * width, self.input_embed_dim).contiguous()
208
+ for view_features in multi_view_features
209
+ ]
210
+
211
+ # Create patch positions for each view if custom positional encoding is used
212
+ if self.custom_positional_encoding is not None:
213
+ multi_view_positions = [
214
+ self.position_getter(batch_size, height, width, view_features.device)
215
+ for view_features in multi_view_features
216
+ ]
217
+ else:
218
+ multi_view_positions = [None] * self.num_views
219
+
220
+ # Project input features to the transformer dimension
221
+ multi_view_features = [self.proj_embed(view_features) for view_features in multi_view_features]
222
+
223
+ # Pass through each view's cross-attention blocks
224
+ # Loop over the depth of the transformer
225
+ for depth_idx in range(self.depth):
226
+ updated_multi_view_features = []
227
+ # Loop over each view
228
+ for view_idx, view_features in enumerate(multi_view_features):
229
+ # Get all the other views
230
+ other_views_features = [multi_view_features[i] for i in range(self.num_views) if i != view_idx]
231
+ # Concatenate all the tokens from the other views
232
+ other_views_features = torch.cat(other_views_features, dim=1)
233
+ # Get the positions for the current view
234
+ view_positions = multi_view_positions[view_idx]
235
+ # Get the positions for all other views
236
+ other_views_positions = (
237
+ torch.cat([multi_view_positions[i] for i in range(self.num_views) if i != view_idx], dim=1)
238
+ if view_positions is not None
239
+ else None
240
+ )
241
+ # Apply the cross-attention block and update the multi-view features
242
+ updated_view_features = self.multi_view_branches[view_idx][depth_idx](
243
+ view_features, other_views_features, view_positions, other_views_positions
244
+ )
245
+ # Keep track of the updated view features
246
+ updated_multi_view_features.append(updated_view_features)
247
+ # Update the multi-view features for the next depth
248
+ multi_view_features = updated_multi_view_features
249
+
250
+ # Normalize the output features
251
+ output_multi_view_features = [self.norm(view_features) for view_features in multi_view_features]
252
+
253
+ # Resize the output multi-view features back to NCHW
254
+ output_multi_view_features = [
255
+ view_features.reshape(batch_size, height, width, self.dim).permute(0, 3, 1, 2).contiguous()
256
+ for view_features in output_multi_view_features
257
+ ]
258
+
259
+ return MultiViewTransformerOutput(features=output_multi_view_features)
260
+
261
+
262
+ class DifferentialMultiViewCrossAttentionTransformerIFR(
263
+ DifferentialMultiViewCrossAttentionTransformer, IntermediateFeatureReturner
264
+ ):
265
+ "Intermediate Feature Returner for UniCeption Multi-View Cross-Attention Transformer"
266
+
267
+ def __init__(
268
+ self,
269
+ name: str,
270
+ input_embed_dim: int,
271
+ num_views: int,
272
+ size: Optional[str] = None,
273
+ depth: int = 12,
274
+ dim: int = 768,
275
+ num_heads: int = 12,
276
+ mlp_ratio: float = 4.0,
277
+ qkv_bias: bool = True,
278
+ qk_norm: bool = False,
279
+ proj_drop: float = 0.0,
280
+ attn_drop: float = 0.0,
281
+ init_values: Optional[float] = None,
282
+ drop_path: float = 0.0,
283
+ act_layer: nn.Module = nn.GELU,
284
+ norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),
285
+ mlp_layer: nn.Module = Mlp,
286
+ custom_positional_encoding: Callable = None,
287
+ norm_cross_tokens: bool = True,
288
+ pretrained_checkpoint_path: str = None,
289
+ indices: Optional[Union[int, List[int]]] = None,
290
+ norm_intermediate: bool = True,
291
+ intermediates_only: bool = False,
292
+ gradient_checkpointing: bool = False,
293
+ *args,
294
+ **kwargs,
295
+ ):
296
+ """
297
+ Initialize the Differential Multi-View Cross-Attention Transformer for information sharing across image features from different views.
298
+ Creates a cross-attention transformer with multiple branches for each view.
299
+ Extends the base class to return intermediate features.
300
+
301
+ Args:
302
+ input_embed_dim (int): Dimension of input embeddings.
303
+ num_views (int): Number of views (input feature sets).
304
+ depth (int): Number of transformer layers. (default: 12, base size)
305
+ dim (int): Dimension of the transformer. (default: 768, base size)
306
+ num_heads (int): Number of attention heads. (default: 12, base size)
307
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
308
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
309
+ qk_norm (bool): Whether to normalize q and k (default: False)
310
+ proj_drop (float): Dropout rate for output (default: 0.)
311
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
312
+ init_values (float): Initial value for LayerScale gamma (default: None)
313
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
314
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
315
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
316
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
317
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
318
+ norm_cross_tokens (bool): Whether to normalize cross tokens (default: True)
319
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
320
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. (default: None) Options:
321
+ - None: Return all intermediate layers.
322
+ - int: Return the last n layers.
323
+ - List[int]: Return the intermediate layers at the specified indices.
324
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. (default: True)
325
+ intermediates_only (bool, optional): Whether to return only the intermediate features. (default: False)
326
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
327
+ """
328
+ # Init the base classes
329
+ DifferentialMultiViewCrossAttentionTransformer.__init__(
330
+ self,
331
+ name=name,
332
+ input_embed_dim=input_embed_dim,
333
+ num_views=num_views,
334
+ size=size,
335
+ depth=depth,
336
+ dim=dim,
337
+ num_heads=num_heads,
338
+ mlp_ratio=mlp_ratio,
339
+ qkv_bias=qkv_bias,
340
+ qk_norm=qk_norm,
341
+ proj_drop=proj_drop,
342
+ attn_drop=attn_drop,
343
+ init_values=init_values,
344
+ drop_path=drop_path,
345
+ act_layer=act_layer,
346
+ norm_layer=norm_layer,
347
+ mlp_layer=mlp_layer,
348
+ custom_positional_encoding=custom_positional_encoding,
349
+ norm_cross_tokens=norm_cross_tokens,
350
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
351
+ gradient_checkpointing=gradient_checkpointing,
352
+ *args,
353
+ **kwargs,
354
+ )
355
+ IntermediateFeatureReturner.__init__(
356
+ self,
357
+ indices=indices,
358
+ norm_intermediate=norm_intermediate,
359
+ intermediates_only=intermediates_only,
360
+ )
361
+
362
+ def forward(
363
+ self,
364
+ model_input: MultiViewTransformerInput,
365
+ ) -> Union[
366
+ List[MultiViewTransformerOutput],
367
+ Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]],
368
+ ]:
369
+ """
370
+ Forward interface for the Multi-View Cross-Attention Transformer with Intermediate Feature Return.
371
+
372
+ Args:
373
+ model_input (MultiViewTransformerInput): Input to the model.
374
+ Expects the features to be a list of size (batch, input_embed_dim, height, width),
375
+ where each entry corresponds to a different view.
376
+
377
+ Returns:
378
+ Union[List[MultiViewTransformerOutput], Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]]]:
379
+ Output of the model post information sharing.
380
+ If intermediates_only is True, returns a list of intermediate outputs.
381
+ If intermediates_only is False, returns a tuple of final output and a list of intermediate outputs.
382
+ """
383
+ # Check that the number of views matches the input and the features are of expected shape
384
+ assert (
385
+ len(model_input.features) == self.num_views
386
+ ), f"Expected {self.num_views} views, got {len(model_input.features)}"
387
+ assert all(
388
+ view_features.shape[1] == self.input_embed_dim for view_features in model_input.features
389
+ ), f"All views must have input dimension {self.input_embed_dim}"
390
+ assert all(
391
+ view_features.ndim == 4 for view_features in model_input.features
392
+ ), "All views must have 4 dimensions (N, C, H, W)"
393
+
394
+ # Get the indices of the intermediate features to return
395
+ intermediate_multi_view_features = []
396
+ take_indices, _ = feature_take_indices(self.depth, self.indices)
397
+
398
+ # Initialize the multi-view features from the model input
399
+ multi_view_features = model_input.features
400
+
401
+ # Resize the multi-view features from NCHW to NLC
402
+ batch_size, _, height, width = multi_view_features[0].shape
403
+ multi_view_features = [
404
+ view_features.permute(0, 2, 3, 1).reshape(batch_size, height * width, self.input_embed_dim).contiguous()
405
+ for view_features in multi_view_features
406
+ ]
407
+
408
+ # Create patch positions for each view if custom positional encoding is used
409
+ if self.custom_positional_encoding is not None:
410
+ multi_view_positions = [
411
+ self.position_getter(batch_size, height, width, view_features.device)
412
+ for view_features in multi_view_features
413
+ ]
414
+ else:
415
+ multi_view_positions = [None] * self.num_views
416
+
417
+ # Project input features to the transformer dimension
418
+ multi_view_features = [self.proj_embed(view_features) for view_features in multi_view_features]
419
+
420
+ # Pass through each view's cross-attention blocks
421
+ # Loop over the depth of the transformer
422
+ for depth_idx in range(self.depth):
423
+ updated_multi_view_features = []
424
+ # Loop over each view
425
+ for view_idx, view_features in enumerate(multi_view_features):
426
+ # Get all the other views
427
+ other_views_features = [multi_view_features[i] for i in range(self.num_views) if i != view_idx]
428
+ # Concatenate all the tokens from the other views
429
+ other_views_features = torch.cat(other_views_features, dim=1)
430
+ # Get the positions for the current view
431
+ view_positions = multi_view_positions[view_idx]
432
+ # Get the positions for all other views
433
+ other_views_positions = (
434
+ torch.cat([multi_view_positions[i] for i in range(self.num_views) if i != view_idx], dim=1)
435
+ if view_positions is not None
436
+ else None
437
+ )
438
+ # Apply the cross-attention block and update the multi-view features
439
+ updated_view_features = self.multi_view_branches[view_idx][depth_idx](
440
+ view_features, other_views_features, view_positions, other_views_positions
441
+ )
442
+ # Keep track of the updated view features
443
+ updated_multi_view_features.append(updated_view_features)
444
+ # Update the multi-view features for the next depth
445
+ multi_view_features = updated_multi_view_features
446
+ # Append the intermediate features if required
447
+ if depth_idx in take_indices:
448
+ # Normalize the intermediate features with final norm layer if enabled
449
+ intermediate_multi_view_features.append(
450
+ [self.norm(view_features) for view_features in multi_view_features]
451
+ if self.norm_intermediate
452
+ else multi_view_features
453
+ )
454
+
455
+ # Reshape the intermediate features and convert to MultiViewTransformerOutput class
456
+ for idx in range(len(intermediate_multi_view_features)):
457
+ intermediate_multi_view_features[idx] = [
458
+ view_features.reshape(batch_size, height, width, self.dim).permute(0, 3, 1, 2).contiguous()
459
+ for view_features in intermediate_multi_view_features[idx]
460
+ ]
461
+ intermediate_multi_view_features[idx] = MultiViewTransformerOutput(
462
+ features=intermediate_multi_view_features[idx]
463
+ )
464
+
465
+ # Return only the intermediate features if enabled
466
+ if self.intermediates_only:
467
+ return intermediate_multi_view_features
468
+
469
+ # Normalize the output features
470
+ output_multi_view_features = [self.norm(view_features) for view_features in multi_view_features]
471
+
472
+ # Resize the output multi-view features back to NCHW
473
+ output_multi_view_features = [
474
+ view_features.reshape(batch_size, height, width, self.dim).permute(0, 3, 1, 2).contiguous()
475
+ for view_features in output_multi_view_features
476
+ ]
477
+
478
+ output_multi_view_features = MultiViewTransformerOutput(features=output_multi_view_features)
479
+
480
+ return output_multi_view_features, intermediate_multi_view_features
481
+
482
+
483
+ def dummy_positional_encoding(x, xpos):
484
+ "Dummy function for positional encoding of tokens"
485
+ x = x
486
+ xpos = xpos
487
+ return x
488
+
489
+
490
+ if __name__ == "__main__":
491
+ # Init multi-view cross-attention transformer with no custom positional encoding and run a forward pass
492
+ for num_views in [2, 3, 4]:
493
+ print(f"Testing MultiViewCrossAttentionTransformer with {num_views} views ...")
494
+ model = DifferentialMultiViewCrossAttentionTransformer(
495
+ name="MV-DCAT", input_embed_dim=1024, num_views=num_views
496
+ )
497
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
498
+ model_input = MultiViewTransformerInput(features=model_input)
499
+ model_output = model(model_input)
500
+ assert len(model_output.features) == num_views
501
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
502
+
503
+ # Init multi-view cross-attention transformer with custom positional encoding and run a forward pass
504
+ for num_views in [2, 3, 4]:
505
+ print(
506
+ f"Testing Differential MultiViewCrossAttentionTransformer with {num_views} views and custom positional encoding ..."
507
+ )
508
+ model = DifferentialMultiViewCrossAttentionTransformer(
509
+ name="MV-DCAT",
510
+ input_embed_dim=1024,
511
+ num_views=num_views,
512
+ custom_positional_encoding=dummy_positional_encoding,
513
+ )
514
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
515
+ model_input = MultiViewTransformerInput(features=model_input)
516
+ model_output = model(model_input)
517
+ assert len(model_output.features) == num_views
518
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
519
+
520
+ print("All multi-view cross-attention transformers initialized and tested successfully!")
521
+
522
+ # Intermediate Feature Returner Tests
523
+ print("Running Intermediate Feature Returner Tests ...")
524
+
525
+ # Run the intermediate feature returner with last-n index
526
+ model_intermediate_feature_returner = DifferentialMultiViewCrossAttentionTransformerIFR(
527
+ name="MV-DCAT-IFR",
528
+ input_embed_dim=1024,
529
+ num_views=2,
530
+ indices=6, # Last 6 layers
531
+ )
532
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
533
+ model_input = MultiViewTransformerInput(features=model_input)
534
+ output = model_intermediate_feature_returner(model_input)
535
+ assert isinstance(output, tuple)
536
+ assert isinstance(output[0], MultiViewTransformerOutput)
537
+ assert len(output[1]) == 6
538
+ assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
539
+ assert len(output[1][0].features) == 2
540
+
541
+ # Run the intermediate feature returner with specific indices
542
+ model_intermediate_feature_returner = DifferentialMultiViewCrossAttentionTransformerIFR(
543
+ name="MV-DCAT-IFR",
544
+ input_embed_dim=1024,
545
+ num_views=2,
546
+ indices=[0, 2, 4, 6], # Specific indices
547
+ )
548
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
549
+ model_input = MultiViewTransformerInput(features=model_input)
550
+ output = model_intermediate_feature_returner(model_input)
551
+ assert isinstance(output, tuple)
552
+ assert isinstance(output[0], MultiViewTransformerOutput)
553
+ assert len(output[1]) == 4
554
+ assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
555
+ assert len(output[1][0].features) == 2
556
+
557
+ # Test the normalizing of intermediate features
558
+ model_intermediate_feature_returner = DifferentialMultiViewCrossAttentionTransformerIFR(
559
+ name="MV-DCAT-IFR",
560
+ input_embed_dim=1024,
561
+ num_views=2,
562
+ indices=[-1], # Last layer
563
+ norm_intermediate=False, # Disable normalization
564
+ )
565
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
566
+ model_input = MultiViewTransformerInput(features=model_input)
567
+ output = model_intermediate_feature_returner(model_input)
568
+ for view_idx in range(2):
569
+ assert not torch.equal(
570
+ output[0].features[view_idx], output[1][-1].features[view_idx]
571
+ ), "Final features and intermediate features (last layer) must be different."
572
+
573
+ model_intermediate_feature_returner = DifferentialMultiViewCrossAttentionTransformerIFR(
574
+ name="MV-DCAT-IFR",
575
+ input_embed_dim=1024,
576
+ num_views=2,
577
+ indices=[-1], # Last layer
578
+ norm_intermediate=True,
579
+ )
580
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
581
+ model_input = MultiViewTransformerInput(features=model_input)
582
+ output = model_intermediate_feature_returner(model_input)
583
+ for view_idx in range(2):
584
+ assert torch.equal(
585
+ output[0].features[view_idx], output[1][-1].features[view_idx]
586
+ ), "Final features and intermediate features (last layer) must be same."
587
+
588
+ print("All Intermediate Feature Returner Tests passed!")
UniCeption/uniception/models/info_sharing/global_attention_transformer.py ADDED
@@ -0,0 +1,1107 @@
1
+ """
2
+ UniCeption Global-Attention Transformer for Information Sharing
3
+ """
4
+
5
+ from functools import partial
6
+ from typing import Callable, List, Optional, Tuple, Type, Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from uniception.models.info_sharing.base import (
13
+ MultiSetTransformerInput,
14
+ MultiSetTransformerOutput,
15
+ MultiViewTransformerInput,
16
+ MultiViewTransformerOutput,
17
+ UniCeptionInfoSharingBase,
18
+ )
19
+ from uniception.models.libs.croco.pos_embed import RoPE2D
20
+ from uniception.models.utils.intermediate_feature_return import IntermediateFeatureReturner, feature_take_indices
21
+ from uniception.models.utils.positional_encoding import PositionGetter
22
+ from uniception.models.utils.transformer_blocks import Mlp, SelfAttentionBlock
23
+
24
+
25
+ class MultiViewGlobalAttentionTransformer(UniCeptionInfoSharingBase):
26
+ "UniCeption Multi-View Global-Attention Transformer for information sharing across image features from different views."
27
+
28
+ def __init__(
29
+ self,
30
+ name: str,
31
+ input_embed_dim: int,
32
+ max_num_views: int,
33
+ use_rand_idx_pe_for_non_reference_views: bool,
34
+ size: Optional[str] = None,
35
+ depth: int = 12,
36
+ dim: int = 768,
37
+ num_heads: int = 12,
38
+ mlp_ratio: float = 4.0,
39
+ qkv_bias: bool = True,
40
+ qk_norm: bool = False,
41
+ proj_drop: float = 0.0,
42
+ attn_drop: float = 0.0,
43
+ init_values: Optional[float] = None,
44
+ drop_path: float = 0.0,
45
+ act_layer: Type[nn.Module] = nn.GELU,
46
+ norm_layer: Union[Type[nn.Module], Callable[..., nn.Module]] = partial(nn.LayerNorm, eps=1e-6),
47
+ mlp_layer: Type[nn.Module] = Mlp,
48
+ custom_positional_encoding: Optional[Union[str, Callable]] = None,
49
+ pretrained_checkpoint_path: Optional[str] = None,
50
+ gradient_checkpointing: bool = False,
51
+ *args,
52
+ **kwargs,
53
+ ):
54
+ """
55
+ Initialize the Multi-View Global-Attention Transformer for information sharing across image features from different views.
56
+
57
+ Args:
58
+ input_embed_dim (int): Dimension of input embeddings.
59
+ max_num_views (int): Maximum number of views for positional encoding.
60
+ use_rand_idx_pe_for_non_reference_views (bool): Whether to use random index positional encoding for non-reference views.
61
+ size (str): String to indicate interpretable size of the transformer (e.g., base, large, ...). (default: None)
62
+ depth (int): Number of transformer layers. (default: 12, base size)
63
+ dim (int): Dimension of the transformer. (default: 768, base size)
64
+ num_heads (int): Number of attention heads. (default: 12, base size)
65
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
66
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
67
+ qk_norm (bool): Whether to normalize q and k (default: False)
68
+ proj_drop (float): Dropout rate for output (default: 0.)
69
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
70
+ init_values (float): Initial value for LayerScale gamma (default: None)
71
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
72
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
73
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
74
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
75
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
76
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
77
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
78
+ """
79
+ # Initialize the base class
80
+ super().__init__(name=name, size=size, *args, **kwargs)
81
+
82
+ # Initialize the specific attributes of the transformer
83
+ self.input_embed_dim = input_embed_dim
84
+ self.max_num_views = max_num_views
85
+ self.use_rand_idx_pe_for_non_reference_views = use_rand_idx_pe_for_non_reference_views
86
+ self.depth = depth
87
+ self.dim = dim
88
+ self.num_heads = num_heads
89
+ self.mlp_ratio = mlp_ratio
90
+ self.qkv_bias = qkv_bias
91
+ self.qk_norm = qk_norm
92
+ self.proj_drop = proj_drop
93
+ self.attn_drop = attn_drop
94
+ self.init_values = init_values
95
+ self.drop_path = drop_path
96
+ self.act_layer = act_layer
97
+ self.norm_layer = norm_layer
98
+ self.mlp_layer = mlp_layer
99
+ self.custom_positional_encoding = custom_positional_encoding
100
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
101
+ self.gradient_checkpointing = gradient_checkpointing
102
+
103
+ # Initialize the projection layer for input embeddings
104
+ if self.input_embed_dim != self.dim:
105
+ self.proj_embed = nn.Linear(self.input_embed_dim, self.dim, bias=True)
106
+ else:
107
+ self.proj_embed = nn.Identity()
108
+
109
+ # Initialize custom position encodings
110
+ if self.custom_positional_encoding is not None and isinstance(self.custom_positional_encoding, str):
111
+ if self.custom_positional_encoding == "rope":
112
+ self.rope = RoPE2D(freq=100.0, F0=1.0)
113
+ self.custom_positional_encoding = self.rope
114
+ else:
115
+ raise ValueError(f"Unknown custom positional encoding: {self.custom_positional_encoding}")
116
+
117
+ # Initialize the self-attention blocks which ingest all views at once
118
+ self.self_attention_blocks = nn.ModuleList(
119
+ [
120
+ SelfAttentionBlock(
121
+ dim=self.dim,
122
+ num_heads=self.num_heads,
123
+ mlp_ratio=self.mlp_ratio,
124
+ qkv_bias=self.qkv_bias,
125
+ qk_norm=self.qk_norm,
126
+ proj_drop=self.proj_drop,
127
+ attn_drop=self.attn_drop,
128
+ init_values=self.init_values,
129
+ drop_path=self.drop_path,
130
+ act_layer=self.act_layer,
131
+ norm_layer=self.norm_layer,
132
+ mlp_layer=self.mlp_layer,
133
+ custom_positional_encoding=self.custom_positional_encoding,
134
+ )
135
+ for _ in range(self.depth)
136
+ ]
137
+ )
138
+
139
+ # Initialize the final normalization layer
140
+ self.norm = self.norm_layer(self.dim)
141
+
142
+ # Initialize the position getter for patch positions if required
143
+ if self.custom_positional_encoding is not None:
144
+ self.position_getter = PositionGetter()
145
+
146
+ # Initialize the positional encoding table for the different views
147
+ self.register_buffer(
148
+ "view_pos_table",
149
+ self._get_sinusoid_encoding_table(self.max_num_views, self.dim, 10000),
150
+ )
151
+
152
+ # Initialize random weights
153
+ self.initialize_weights()
154
+
155
+ # Load pretrained weights if provided
156
+ if self.pretrained_checkpoint_path is not None:
157
+ print(
158
+ f"Loading pretrained multi-view global-attention transformer weights from {self.pretrained_checkpoint_path} ..."
159
+ )
160
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
161
+ print(self.load_state_dict(ckpt["model"]))
162
+
163
+ # Apply gradient checkpointing if enabled
164
+ if self.gradient_checkpointing:
165
+ for i, block in enumerate(self.self_attention_blocks):
166
+ self.self_attention_blocks[i] = self.wrap_module_with_gradient_checkpointing(block)
167
+
168
+ def _get_sinusoid_encoding_table(self, n_position, d_hid, base):
169
+ "Sinusoid position encoding table"
170
+
171
+ def get_position_angle_vec(position):
172
+ return [position / np.power(base, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
173
+
174
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
175
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
176
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
177
+
178
+ return torch.FloatTensor(sinusoid_table)
179
+
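
The table built above is the standard sinusoidal encoding, indexed by view rather than by token position: row v holds sin(v / base^(2i/d)) in the even dimensions and cos(v / base^(2i/d)) in the odd ones. A standalone sketch of the same construction, shown only to make the formula explicit:

```python
import numpy as np
import torch


def sinusoid_table(n_position: int, d_hid: int, base: float = 10000.0) -> torch.Tensor:
    # One row per view index; mirrors _get_sinusoid_encoding_table above.
    angles = np.array(
        [[pos / np.power(base, 2 * (j // 2) / d_hid) for j in range(d_hid)] for pos in range(n_position)]
    )
    angles[:, 0::2] = np.sin(angles[:, 0::2])  # even dims -> sin
    angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd dims  -> cos
    return torch.FloatTensor(angles)


table = sinusoid_table(n_position=4, d_hid=8)
print(table.shape)  # torch.Size([4, 8]); row v is broadcast over every token of view v
```
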
180
+ def initialize_weights(self):
181
+ "Initialize weights of the transformer."
182
+ # Linears and layer norms
183
+ self.apply(self._init_weights)
184
+
185
+ def _init_weights(self, m):
186
+ "Initialize the transformer linear and layer norm weights."
187
+ if isinstance(m, nn.Linear):
188
+ # We use xavier_uniform following official JAX ViT:
189
+ torch.nn.init.xavier_uniform_(m.weight)
190
+ if isinstance(m, nn.Linear) and m.bias is not None:
191
+ nn.init.constant_(m.bias, 0)
192
+ elif isinstance(m, nn.LayerNorm):
193
+ nn.init.constant_(m.bias, 0)
194
+ nn.init.constant_(m.weight, 1.0)
195
+
196
+ def forward(
197
+ self,
198
+ model_input: MultiViewTransformerInput,
199
+ ) -> MultiViewTransformerOutput:
200
+ """
201
+ Forward interface for the Multi-View Global-Attention Transformer.
202
+
203
+ Args:
204
+ model_input (MultiViewTransformerInput): Input to the model.
205
+ Expects the features to be a list of size (batch, input_embed_dim, height, width),
206
+ where each entry corresponds to a different view.
207
+ Optionally, the input can also include additional_input_tokens (e.g., class token, registers, pose tokens, scale token)
208
+ which are appended to the token set from the multi-view features. The tokens are of size (batch, input_embed_dim, num_of_additional_tokens).
209
+
210
+ Returns:
211
+ MultiViewTransformerOutput: Output of the model post information sharing.
212
+ """
213
+ # Check that the number of views matches the input and the features are of expected shape
214
+ assert (
215
+ len(model_input.features) <= self.max_num_views
216
+ ), f"Expected less than {self.max_num_views} views, got {len(model_input.features)}"
217
+ assert all(
218
+ curr_view_features.shape[1] == self.input_embed_dim for curr_view_features in model_input.features
219
+ ), f"All views must have input dimension {self.input_embed_dim}"
220
+ assert all(
221
+ curr_view_features.ndim == 4 for curr_view_features in model_input.features
222
+ ), "All views must have 4 dimensions (N, C, H, W)"
223
+
224
+ # Initialize the multi-view features from the model input and number of views for current input
225
+ multi_view_features = model_input.features
226
+ num_of_views = len(multi_view_features)
227
+ batch_size, _, height, width = multi_view_features[0].shape
228
+ num_of_tokens_per_view = height * width
229
+
230
+ # Stack the multi-view features (N, C, H, W) to (N, V, C, H, W) (assumes all V views have same shape)
231
+ multi_view_features = torch.stack(multi_view_features, dim=1)
232
+
233
+ # Resize the multi-view features from NVCHW to NLC, where L = V * H * W
234
+ multi_view_features = multi_view_features.permute(0, 1, 3, 4, 2) # (N, V, H, W, C)
235
+ multi_view_features = multi_view_features.reshape(
236
+ batch_size, num_of_views * height * width, self.input_embed_dim
237
+ ).contiguous()
238
+
239
+ # Process additional input tokens if provided
240
+ if model_input.additional_input_tokens is not None:
241
+ additional_tokens = model_input.additional_input_tokens
242
+ assert additional_tokens.ndim == 3, "Additional tokens must have 3 dimensions (N, C, T)"
243
+ assert (
244
+ additional_tokens.shape[1] == self.input_embed_dim
245
+ ), f"Additional tokens must have input dimension {self.input_embed_dim}"
246
+ assert additional_tokens.shape[0] == batch_size, "Batch size mismatch for additional tokens"
247
+
248
+ # Reshape to channel-last format for transformer processing
249
+ additional_tokens = additional_tokens.permute(0, 2, 1).contiguous() # (N, C, T) -> (N, T, C)
250
+
251
+ # Concatenate the additional tokens to the multi-view features
252
+ multi_view_features = torch.cat([multi_view_features, additional_tokens], dim=1)
253
+
254
+ # Project input features to the transformer dimension
255
+ multi_view_features = self.proj_embed(multi_view_features)
256
+
257
+ # Create patch positions for each view if custom positional encoding is used
258
+ if self.custom_positional_encoding is not None:
259
+ multi_view_positions = [
260
+ self.position_getter(batch_size, height, width, multi_view_features.device)
261
+ ] * num_of_views # List of length V, where each tensor is (N, H * W, C)
262
+ multi_view_positions = torch.cat(multi_view_positions, dim=1) # (N, V * H * W, C)
263
+ else:
264
+ multi_view_positions = [None] * num_of_views
265
+
266
+ # Add None positions for additional tokens if they exist
267
+ if model_input.additional_input_tokens is not None:
268
+ additional_tokens_positions = [None] * model_input.additional_input_tokens.shape[2]  # one (empty) position per additional token
269
+ multi_view_positions = multi_view_positions + additional_tokens_positions
270
+
271
+ # Add positional encoding for reference view (idx 0)
272
+ ref_view_pe = self.view_pos_table[0].clone().detach()
273
+ ref_view_pe = ref_view_pe.reshape((1, 1, self.dim))
274
+ ref_view_pe = ref_view_pe.repeat(batch_size, num_of_tokens_per_view, 1)
275
+ ref_view_features = multi_view_features[:, :num_of_tokens_per_view, :]
276
+ ref_view_features = ref_view_features + ref_view_pe
277
+
278
+ # Add positional encoding for non-reference views (sequential indices starting from idx 1 or random indices which are uniformly sampled)
279
+ if self.use_rand_idx_pe_for_non_reference_views:
280
+ non_ref_view_pe_indices = torch.randint(low=1, high=self.max_num_views, size=(num_of_views - 1,))
281
+ else:
282
+ non_ref_view_pe_indices = torch.arange(1, num_of_views)
283
+ non_ref_view_pe = self.view_pos_table[non_ref_view_pe_indices].clone().detach()
284
+ non_ref_view_pe = non_ref_view_pe.reshape((1, num_of_views - 1, self.dim))
285
+ non_ref_view_pe = non_ref_view_pe.repeat_interleave(num_of_tokens_per_view, dim=1)
286
+ non_ref_view_pe = non_ref_view_pe.repeat(batch_size, 1, 1)
287
+ non_ref_view_features = multi_view_features[
288
+ :, num_of_tokens_per_view : num_of_views * num_of_tokens_per_view, :
289
+ ]
290
+ non_ref_view_features = non_ref_view_features + non_ref_view_pe
291
+
292
+ # Concatenate the reference and non-reference view features
293
+ # Handle additional tokens (no view-based positional encoding for them)
294
+ if model_input.additional_input_tokens is not None:
295
+ additional_features = multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
296
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features, additional_features], dim=1)
297
+ else:
298
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features], dim=1)
299
+
300
+ # Loop over the depth of the transformer
301
+ for depth_idx in range(self.depth):
302
+ # Apply the self-attention block and update the multi-view features
303
+ multi_view_features = self.self_attention_blocks[depth_idx](multi_view_features, multi_view_positions)
304
+
305
+ # Normalize the output features
306
+ output_multi_view_features = self.norm(multi_view_features)
307
+
308
+ # Extract only the view features (excluding additional tokens)
309
+ view_features = output_multi_view_features[:, : num_of_views * num_of_tokens_per_view, :]
310
+
311
+ # Reshape the output multi-view features (N, V * H * W, C) back to (N, V, C, H, W)
312
+ view_features = view_features.reshape(batch_size, num_of_views, height, width, self.dim) # (N, V, H, W, C)
313
+ view_features = view_features.permute(0, 1, 4, 2, 3).contiguous() # (N, V, C, H, W)
314
+
315
+ # Split the output multi-view features into separate views
316
+ view_features = view_features.split(1, dim=1)
317
+ view_features = [output_view_features.squeeze(dim=1) for output_view_features in view_features]
318
+
319
+ # Extract and return additional token features if provided
320
+ if model_input.additional_input_tokens is not None:
321
+ additional_token_features = output_multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
322
+ additional_token_features = additional_token_features.permute(0, 2, 1).contiguous() # (N, C, T)
323
+ return MultiViewTransformerOutput(
324
+ features=view_features, additional_token_features=additional_token_features
325
+ )
326
+ else:
327
+ return MultiViewTransformerOutput(features=view_features)
328
+
329
+
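
An illustrative forward pass (not part of the commit) exercising the additional-token path documented above; the (N, C, T) layout follows the assertions in `forward`, and the module path is assumed from the file location.

```python
import torch

from uniception.models.info_sharing.base import MultiViewTransformerInput
from uniception.models.info_sharing.global_attention_transformer import MultiViewGlobalAttentionTransformer

model = MultiViewGlobalAttentionTransformer(
    name="MV-GAT",
    input_embed_dim=1024,
    max_num_views=16,
    use_rand_idx_pe_for_non_reference_views=False,
)
views = [torch.rand(2, 1024, 14, 14) for _ in range(3)]  # 3 views of NCHW features
extra = torch.rand(2, 1024, 5)                           # e.g. 1 class + 4 register tokens, (N, C, T)
out = model(MultiViewTransformerInput(features=views, additional_input_tokens=extra))
print(len(out.features), out.features[0].shape)          # 3, (2, model.dim, 14, 14)
print(out.additional_token_features.shape)               # (2, model.dim, 5)
```
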
330
+ class MultiViewGlobalAttentionTransformerIFR(MultiViewGlobalAttentionTransformer, IntermediateFeatureReturner):
331
+ "Intermediate Feature Returner for UniCeption Multi-View Global-Attention Transformer"
332
+
333
+ def __init__(
334
+ self,
335
+ name: str,
336
+ input_embed_dim: int,
337
+ max_num_views: int,
338
+ use_rand_idx_pe_for_non_reference_views: bool,
339
+ size: Optional[str] = None,
340
+ depth: int = 12,
341
+ dim: int = 768,
342
+ num_heads: int = 12,
343
+ mlp_ratio: float = 4.0,
344
+ qkv_bias: bool = True,
345
+ qk_norm: bool = False,
346
+ proj_drop: float = 0.0,
347
+ attn_drop: float = 0.0,
348
+ init_values: Optional[float] = None,
349
+ drop_path: float = 0.0,
350
+ act_layer: nn.Module = nn.GELU,
351
+ norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),
352
+ mlp_layer: nn.Module = Mlp,
353
+ custom_positional_encoding: Optional[Callable] = None,
354
+ pretrained_checkpoint_path: Optional[str] = None,
355
+ indices: Optional[Union[int, List[int]]] = None,
356
+ norm_intermediate: bool = True,
357
+ intermediates_only: bool = False,
358
+ gradient_checkpointing: bool = False,
359
+ *args,
360
+ **kwargs,
361
+ ):
362
+ """
363
+ Initialize the Multi-View Global-Attention Transformer for information sharing across image features from different views.
364
+ Extends the base class to return intermediate features.
365
+
366
+ Args:
367
+ input_embed_dim (int): Dimension of input embeddings.
368
+ max_num_views (int): Maximum number of views for positional encoding.
369
+ use_rand_idx_pe_for_non_reference_views (bool): Whether to use random index positional encoding for non-reference views.
370
+ size (str): String to indicate interpretable size of the transformer (e.g., base, large, ...). (default: None)
371
+ depth (int): Number of transformer layers. (default: 12, base size)
372
+ dim (int): Dimension of the transformer. (default: 768, base size)
373
+ num_heads (int): Number of attention heads. (default: 12, base size)
374
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
375
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
376
+ qk_norm (bool): Whether to normalize q and k (default: False)
377
+ proj_drop (float): Dropout rate for output (default: 0.)
378
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
379
+ init_values (float): Initial value for LayerScale gamma (default: None)
380
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
381
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
382
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
383
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
384
+ custom_positional_encoding (Callable): Custom positional encoding function (default: None)
385
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
386
+ indices (Optional[Union[int, List[int]]], optional): Indices of the layers to return. (default: None) Options:
387
+ - None: Return all intermediate layers.
388
+ - int: Return the last n layers.
389
+ - List[int]: Return the intermediate layers at the specified indices.
390
+ norm_intermediate (bool, optional): Whether to normalize the intermediate features. (default: True)
391
+ intermediates_only (bool, optional): Whether to return only the intermediate features. (default: False)
392
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
393
+ """
394
+ # Init the base classes
395
+ MultiViewGlobalAttentionTransformer.__init__(
396
+ self,
397
+ name=name,
398
+ input_embed_dim=input_embed_dim,
399
+ max_num_views=max_num_views,
400
+ use_rand_idx_pe_for_non_reference_views=use_rand_idx_pe_for_non_reference_views,
401
+ size=size,
402
+ depth=depth,
403
+ dim=dim,
404
+ num_heads=num_heads,
405
+ mlp_ratio=mlp_ratio,
406
+ qkv_bias=qkv_bias,
407
+ qk_norm=qk_norm,
408
+ proj_drop=proj_drop,
409
+ attn_drop=attn_drop,
410
+ init_values=init_values,
411
+ drop_path=drop_path,
412
+ act_layer=act_layer,
413
+ norm_layer=norm_layer,
414
+ mlp_layer=mlp_layer,
415
+ custom_positional_encoding=custom_positional_encoding,
416
+ pretrained_checkpoint_path=pretrained_checkpoint_path,
417
+ gradient_checkpointing=gradient_checkpointing,
418
+ *args,
419
+ **kwargs,
420
+ )
421
+ IntermediateFeatureReturner.__init__(
422
+ self,
423
+ indices=indices,
424
+ norm_intermediate=norm_intermediate,
425
+ intermediates_only=intermediates_only,
426
+ )
427
+
428
+ def forward(
429
+ self,
430
+ model_input: MultiViewTransformerInput,
431
+ ) -> Union[
432
+ List[MultiViewTransformerOutput],
433
+ Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]],
434
+ ]:
435
+ """
436
+ Forward interface for the Multi-View Global-Attention Transformer with Intermediate Feature Return.
437
+
438
+ Args:
439
+ model_input (MultiViewTransformerInput): Input to the model.
440
+ Expects the features to be a list of size (batch, input_embed_dim, height, width),
441
+ where each entry corresponds to a different view.
442
+ Optionally, the input can also include additional_input_tokens (e.g., class token, registers, pose tokens, scale token)
443
+ which are appended to the token set from the multi-view features. The tokens are of size (batch, input_embed_dim, num_of_additional_tokens).
444
+
445
+ Returns:
446
+ Union[List[MultiViewTransformerOutput], Tuple[MultiViewTransformerOutput, List[MultiViewTransformerOutput]]]:
447
+ Output of the model post information sharing.
448
+ If intermediates_only is True, returns a list of intermediate outputs.
449
+ If intermediates_only is False, returns a tuple of final output and a list of intermediate outputs.
450
+ """
451
+ # Check that the number of views matches the input and the features are of expected shape
452
+ assert (
453
+ len(model_input.features) <= self.max_num_views
454
+ ), f"Expected {self.num_views} views, got {len(model_input.features)}"
455
+ assert all(
456
+ curr_view_features.shape[1] == self.input_embed_dim for curr_view_features in model_input.features
457
+ ), f"All views must have input dimension {self.input_embed_dim}"
458
+ assert all(
459
+ curr_view_features.ndim == 4 for curr_view_features in model_input.features
460
+ ), "All views must have 4 dimensions (N, C, H, W)"
461
+
462
+ # Get the indices of the intermediate features to return
463
+ intermediate_multi_view_features = []
464
+ take_indices, _ = feature_take_indices(self.depth, self.indices)
465
+
466
+ # Initialize the multi-view features from the model input and number of views for current input
467
+ multi_view_features = model_input.features
468
+ num_of_views = len(multi_view_features)
469
+ batch_size, _, height, width = multi_view_features[0].shape
470
+ num_of_tokens_per_view = height * width
471
+
472
+ # Stack the multi-view features (N, C, H, W) to (N, V, C, H, W) (assumes all V views have same shape)
473
+ multi_view_features = torch.stack(multi_view_features, dim=1)
474
+
475
+ # Resize the multi-view features from NVCHW to NLC, where L = V * H * W
476
+ multi_view_features = multi_view_features.permute(0, 1, 3, 4, 2) # (N, V, H, W, C)
477
+ multi_view_features = multi_view_features.reshape(
478
+ batch_size, num_of_views * height * width, self.input_embed_dim
479
+ ).contiguous()
480
+
481
+ # Process additional input tokens if provided
482
+ if model_input.additional_input_tokens is not None:
483
+ additional_tokens = model_input.additional_input_tokens
484
+ assert additional_tokens.ndim == 3, "Additional tokens must have 3 dimensions (N, C, T)"
485
+ assert (
486
+ additional_tokens.shape[1] == self.input_embed_dim
487
+ ), f"Additional tokens must have input dimension {self.input_embed_dim}"
488
+ assert additional_tokens.shape[0] == batch_size, "Batch size mismatch for additional tokens"
489
+
490
+ # Reshape to channel-last format for transformer processing
491
+ additional_tokens = additional_tokens.permute(0, 2, 1).contiguous() # (N, C, T) -> (N, T, C)
492
+
493
+ # Concatenate the additional tokens to the multi-view features
494
+ multi_view_features = torch.cat([multi_view_features, additional_tokens], dim=1)
495
+
496
+ # Project input features to the transformer dimension
497
+ multi_view_features = self.proj_embed(multi_view_features)
498
+
499
+ # Create patch positions for each view if custom positional encoding is used
500
+ if self.custom_positional_encoding is not None:
501
+ multi_view_positions = [
502
+ self.position_getter(batch_size, height, width, multi_view_features.device)
503
+ ] * num_of_views # List of length V, where each tensor is (N, H * W, C)
504
+ multi_view_positions = torch.cat(multi_view_positions, dim=1) # (N, V * H * W, C)
505
+ else:
506
+ multi_view_positions = [None] * num_of_views
507
+
508
+ # Add None positions for additional tokens if they exist
509
+ if model_input.additional_input_tokens is not None:
510
+ additional_tokens_positions = [None] * model_input.additional_input_tokens.shape[2]  # one (empty) position per additional token
511
+ multi_view_positions = multi_view_positions + additional_tokens_positions
512
+
513
+ # Add positional encoding for reference view (idx 0)
514
+ ref_view_pe = self.view_pos_table[0].clone().detach()
515
+ ref_view_pe = ref_view_pe.reshape((1, 1, self.dim))
516
+ ref_view_pe = ref_view_pe.repeat(batch_size, num_of_tokens_per_view, 1)
517
+ ref_view_features = multi_view_features[:, :num_of_tokens_per_view, :]
518
+ ref_view_features = ref_view_features + ref_view_pe
519
+
520
+ # Add positional encoding for non-reference views (sequential indices starting from idx 1 or random indices which are uniformly sampled)
521
+ if self.use_rand_idx_pe_for_non_reference_views:
522
+ non_ref_view_pe_indices = torch.randint(low=1, high=self.max_num_views, size=(num_of_views - 1,))
523
+ else:
524
+ non_ref_view_pe_indices = torch.arange(1, num_of_views)
525
+ non_ref_view_pe = self.view_pos_table[non_ref_view_pe_indices].clone().detach()
526
+ non_ref_view_pe = non_ref_view_pe.reshape((1, num_of_views - 1, self.dim))
527
+ non_ref_view_pe = non_ref_view_pe.repeat_interleave(num_of_tokens_per_view, dim=1)
528
+ non_ref_view_pe = non_ref_view_pe.repeat(batch_size, 1, 1)
529
+ non_ref_view_features = multi_view_features[
530
+ :, num_of_tokens_per_view : num_of_views * num_of_tokens_per_view, :
531
+ ]
532
+ non_ref_view_features = non_ref_view_features + non_ref_view_pe
533
+
534
+ # Concatenate the reference and non-reference view features
535
+ # Handle additional tokens (no view-based positional encoding for them)
536
+ if model_input.additional_input_tokens is not None:
537
+ additional_features = multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
538
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features, additional_features], dim=1)
539
+ else:
540
+ multi_view_features = torch.cat([ref_view_features, non_ref_view_features], dim=1)
541
+
542
+ # Loop over the depth of the transformer
543
+ for depth_idx in range(self.depth):
544
+ # Apply the self-attention block and update the multi-view features
545
+ multi_view_features = self.self_attention_blocks[depth_idx](multi_view_features, multi_view_positions)
546
+ if depth_idx in take_indices:
547
+ # Normalize the intermediate features with final norm layer if enabled
548
+ intermediate_multi_view_features.append(
549
+ self.norm(multi_view_features) if self.norm_intermediate else multi_view_features
550
+ )
551
+
552
+ # Reshape the intermediate features and convert to MultiViewTransformerOutput class
553
+ for idx in range(len(intermediate_multi_view_features)):
554
+ # Get the current intermediate features
555
+ current_features = intermediate_multi_view_features[idx]
556
+
557
+ # Extract additional token features if provided
558
+ additional_token_features = None
559
+ if model_input.additional_input_tokens is not None:
560
+ additional_token_features = current_features[:, num_of_views * num_of_tokens_per_view :, :]
561
+ additional_token_features = additional_token_features.permute(0, 2, 1).contiguous() # (N, C, T)
562
+ # Only keep the view features for reshaping
563
+ current_features = current_features[:, : num_of_views * num_of_tokens_per_view, :]
564
+
565
+ # Reshape the intermediate multi-view features (N, V * H * W, C) back to (N, V, C, H, W)
566
+ current_features = current_features.reshape(
567
+ batch_size, num_of_views, height, width, self.dim
568
+ ) # (N, V, H, W, C)
569
+ current_features = current_features.permute(0, 1, 4, 2, 3).contiguous() # (N, V, C, H, W)
570
+
571
+ # Split the intermediate multi-view features into separate views
572
+ current_features = current_features.split(1, dim=1)
573
+ current_features = [
574
+ intermediate_view_features.squeeze(dim=1) for intermediate_view_features in current_features
575
+ ]
576
+
577
+ intermediate_multi_view_features[idx] = MultiViewTransformerOutput(
578
+ features=current_features, additional_token_features=additional_token_features
579
+ )
580
+
581
+ # Return only the intermediate features if enabled
582
+ if self.intermediates_only:
583
+ return intermediate_multi_view_features
584
+
585
+ # Normalize the output features
586
+ output_multi_view_features = self.norm(multi_view_features)
587
+
588
+ # Extract view features (excluding additional tokens)
589
+ additional_token_features = None
590
+ if model_input.additional_input_tokens is not None:
591
+ additional_token_features = output_multi_view_features[:, num_of_views * num_of_tokens_per_view :, :]
592
+ additional_token_features = additional_token_features.permute(0, 2, 1).contiguous() # (N, C, T)
593
+ view_features = output_multi_view_features[:, : num_of_views * num_of_tokens_per_view, :]
594
+ else:
595
+ view_features = output_multi_view_features
596
+
597
+ # Reshape the output multi-view features (N, V * H * W, C) back to (N, V, C, H, W)
598
+ view_features = view_features.reshape(batch_size, num_of_views, height, width, self.dim) # (N, V, H, W, C)
599
+ view_features = view_features.permute(0, 1, 4, 2, 3).contiguous() # (N, V, C, H, W)
600
+
601
+ # Split the output multi-view features into separate views
602
+ view_features = view_features.split(1, dim=1)
603
+ view_features = [output_view_features.squeeze(dim=1) for output_view_features in view_features]
604
+
605
+ output_multi_view_features = MultiViewTransformerOutput(
606
+ features=view_features, additional_token_features=additional_token_features
607
+ )
608
+
609
+ return output_multi_view_features, intermediate_multi_view_features
610
+
611
+
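
As with the differential variant earlier in this commit, the IFR subclass returns `(final_output, intermediates)` unless `intermediates_only=True`. A brief sketch of the tuple form under assumed arguments; `indices=2` means the last two layers:

```python
import torch

from uniception.models.info_sharing.base import MultiViewTransformerInput
from uniception.models.info_sharing.global_attention_transformer import MultiViewGlobalAttentionTransformerIFR

model = MultiViewGlobalAttentionTransformerIFR(
    name="MV-GAT-IFR",
    input_embed_dim=1024,
    max_num_views=16,
    use_rand_idx_pe_for_non_reference_views=False,
    indices=2,               # keep the last two layers
    norm_intermediate=True,  # intermediates pass through the final LayerNorm
)
views = MultiViewTransformerInput(features=[torch.rand(1, 1024, 14, 14) for _ in range(2)])
final, taps = model(views)
print(len(taps), final.features[0].shape)  # 2, (1, model.dim, 14, 14)
# With norm_intermediate=True, the last tap matches the final features (cf. the tests below).
print(torch.equal(final.features[0], taps[-1].features[0]))  # True
```
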
612
+ class GlobalAttentionTransformer(UniCeptionInfoSharingBase):
613
+ "UniCeption Global-Attention Transformer for information sharing across different set of features."
614
+
615
+ def __init__(
616
+ self,
617
+ name: str,
618
+ input_embed_dim: int,
619
+ max_num_sets: int,
620
+ use_rand_idx_pe_for_non_reference_sets: bool,
621
+ size: Optional[str] = None,
622
+ depth: int = 12,
623
+ dim: int = 768,
624
+ num_heads: int = 12,
625
+ mlp_ratio: float = 4.0,
626
+ qkv_bias: bool = True,
627
+ qk_norm: bool = False,
628
+ proj_drop: float = 0.0,
629
+ attn_drop: float = 0.0,
630
+ init_values: Optional[float] = None,
631
+ drop_path: float = 0.0,
632
+ act_layer: Type[nn.Module] = nn.GELU,
633
+ norm_layer: Union[Type[nn.Module], Callable[..., nn.Module]] = partial(nn.LayerNorm, eps=1e-6),
634
+ mlp_layer: Type[nn.Module] = Mlp,
635
+ pretrained_checkpoint_path: Optional[str] = None,
636
+ gradient_checkpointing: bool = False,
637
+ *args,
638
+ **kwargs,
639
+ ):
640
+ """
641
+ Initialize the Global-Attention Transformer for information sharing across features from different sets.
642
+
643
+ Args:
644
+ input_embed_dim (int): Dimension of input embeddings.
645
+ max_num_sets (int): Maximum number of sets for positional encoding.
646
+ use_rand_idx_pe_for_non_reference_sets (bool): Whether to use random index positional encoding for non-reference sets.
647
+ size (str): String to indicate interpretable size of the transformer (e.g., base, large, ...). (default: None)
648
+ depth (int): Number of transformer layers. (default: 12, base size)
649
+ dim (int): Dimension of the transformer. (default: 768, base size)
650
+ num_heads (int): Number of attention heads. (default: 12, base size)
651
+ mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
652
+ qkv_bias (bool): Whether to include bias in qkv projection (default: True)
653
+ qk_norm (bool): Whether to normalize q and k (default: False)
654
+ proj_drop (float): Dropout rate for output (default: 0.)
655
+ attn_drop (float): Dropout rate for attention weights (default: 0.)
656
+ init_values (float): Initial value for LayerScale gamma (default: None)
657
+ drop_path (float): Dropout rate for stochastic depth (default: 0.)
658
+ act_layer (nn.Module): Activation layer (default: nn.GELU)
659
+ norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
660
+ mlp_layer (nn.Module): MLP layer (default: Mlp)
661
+ pretrained_checkpoint_path (str, optional): Path to the pretrained checkpoint. (default: None)
662
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing for memory efficiency. (default: False)
663
+ """
664
+ # Initialize the base class
665
+ super().__init__(name=name, size=size, *args, **kwargs)
666
+
667
+ # Initialize the specific attributes of the transformer
668
+ self.input_embed_dim = input_embed_dim
669
+ self.max_num_sets = max_num_sets
670
+ self.use_rand_idx_pe_for_non_reference_sets = use_rand_idx_pe_for_non_reference_sets
671
+ self.depth = depth
672
+ self.dim = dim
673
+ self.num_heads = num_heads
674
+ self.mlp_ratio = mlp_ratio
675
+ self.qkv_bias = qkv_bias
676
+ self.qk_norm = qk_norm
677
+ self.proj_drop = proj_drop
678
+ self.attn_drop = attn_drop
679
+ self.init_values = init_values
680
+ self.drop_path = drop_path
681
+ self.act_layer = act_layer
682
+ self.norm_layer = norm_layer
683
+ self.mlp_layer = mlp_layer
684
+ self.pretrained_checkpoint_path = pretrained_checkpoint_path
685
+ self.gradient_checkpointing = gradient_checkpointing
686
+
687
+ # Initialize the projection layer for input embeddings
688
+ if self.input_embed_dim != self.dim:
689
+ self.proj_embed = nn.Linear(self.input_embed_dim, self.dim, bias=True)
690
+ else:
691
+ self.proj_embed = nn.Identity()
692
+
693
+ # Initialize the self-attention blocks which ingest all sets at once
694
+ self.self_attention_blocks = nn.ModuleList(
695
+ [
696
+ SelfAttentionBlock(
697
+ dim=self.dim,
698
+ num_heads=self.num_heads,
699
+ mlp_ratio=self.mlp_ratio,
700
+ qkv_bias=self.qkv_bias,
701
+ qk_norm=self.qk_norm,
702
+ proj_drop=self.proj_drop,
703
+ attn_drop=self.attn_drop,
704
+ init_values=self.init_values,
705
+ drop_path=self.drop_path,
706
+ act_layer=self.act_layer,
707
+ norm_layer=self.norm_layer,
708
+ mlp_layer=self.mlp_layer,
709
+ )
710
+ for _ in range(self.depth)
711
+ ]
712
+ )
713
+
714
+ # Initialize the final normalization layer
715
+ self.norm = self.norm_layer(self.dim)
716
+
717
+ # Initialize the positional encoding table for the different sets
718
+ self.register_buffer(
719
+ "set_pos_table",
720
+ self._get_sinusoid_encoding_table(self.max_num_sets, self.dim, 10000),
721
+ )
722
+
723
+ # Initialize random weights
724
+ self.initialize_weights()
725
+
726
+ # Load pretrained weights if provided
727
+ if self.pretrained_checkpoint_path is not None:
728
+ print(f"Loading pretrained global-attention transformer weights from {self.pretrained_checkpoint_path} ...")
729
+ ckpt = torch.load(self.pretrained_checkpoint_path, weights_only=False)
730
+ print(self.load_state_dict(ckpt["model"]))
731
+
732
+ # Apply gradient checkpointing if enabled
733
+ if self.gradient_checkpointing:
734
+ for i, block in enumerate(self.self_attention_blocks):
735
+ self.self_attention_blocks[i] = self.wrap_module_with_gradient_checkpointing(block)
736
+
737
+ def _get_sinusoid_encoding_table(self, n_position, d_hid, base):
738
+ "Sinusoid position encoding table"
739
+
740
+ def get_position_angle_vec(position):
741
+ return [position / np.power(base, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
742
+
743
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
744
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
745
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
746
+
747
+ return torch.FloatTensor(sinusoid_table)
748
+
749
+ def initialize_weights(self):
750
+ "Initialize weights of the transformer."
751
+ # Linears and layer norms
752
+ self.apply(self._init_weights)
753
+
754
+ def _init_weights(self, m):
755
+ "Initialize the transformer linear and layer norm weights."
756
+ if isinstance(m, nn.Linear):
757
+ # We use xavier_uniform following official JAX ViT:
758
+ torch.nn.init.xavier_uniform_(m.weight)
759
+ if isinstance(m, nn.Linear) and m.bias is not None:
760
+ nn.init.constant_(m.bias, 0)
761
+ elif isinstance(m, nn.LayerNorm):
762
+ nn.init.constant_(m.bias, 0)
763
+ nn.init.constant_(m.weight, 1.0)
764
+
765
+ def forward(
766
+ self,
767
+ model_input: MultiSetTransformerInput,
768
+ ) -> MultiSetTransformerOutput:
769
+ """
770
+ Forward interface for the Multi-Set Global-Attention Transformer.
771
+
772
+ Args:
773
+ model_input (MultiSetTransformerInput): Input to the model.
774
+ Expects the features to be a list of size (batch, input_embed_dim, num_tokens),
775
+ where each entry corresponds to a different set of tokens and
776
+ the number of tokens can be different for each set.
777
+ Optionally, the input can also include additional_input_tokens (e.g., class token, registers, pose tokens, scale token)
778
+ which are appended to the token set from the multi-view features. The tokens are of size (batch, input_embed_dim, num_of_additional_tokens).
779
+
780
+ Returns:
781
+ MultiSetTransformerOutput: Output of the model post information sharing.
782
+ """
783
+ # Check that the number of sets matches the input and the features are of expected shape
784
+ assert (
785
+ len(model_input.features) <= self.max_num_sets
786
+ ), f"Expected less than {self.max_num_sets} sets, got {len(model_input.features)}"
787
+ assert all(
788
+ set_features.shape[1] == self.input_embed_dim for set_features in model_input.features
789
+ ), f"All sets must have input dimension {self.input_embed_dim}"
790
+ assert all(
791
+ set_features.ndim == 3 for set_features in model_input.features
792
+ ), "All sets must have 3 dimensions (N, C, T)"
793
+
794
+ # Initialize the multi-set features from the model input and number of sets for current input
795
+ multi_set_features = model_input.features
796
+ num_of_sets = len(multi_set_features)
797
+ batch_size, _, _ = multi_set_features[0].shape
798
+ num_of_tokens_per_set = [set_features.shape[2] for set_features in multi_set_features]
799
+
800
+ # Permute the multi-set features from (N, C, T) to (N, T, C)
801
+ multi_set_features = [set_features.permute(0, 2, 1).contiguous() for set_features in multi_set_features]
802
+
803
+ # Stack the multi-set features along the number of tokens dimension
804
+ multi_set_features = torch.cat(multi_set_features, dim=1)
805
+
806
+ # Process additional input tokens if provided
807
+ if model_input.additional_input_tokens is not None:
808
+ additional_tokens = model_input.additional_input_tokens
809
+ assert additional_tokens.ndim == 3, "Additional tokens must have 3 dimensions (N, C, T)"
810
+ assert (
811
+ additional_tokens.shape[1] == self.input_embed_dim
812
+ ), f"Additional tokens must have input dimension {self.input_embed_dim}"
813
+ assert additional_tokens.shape[0] == batch_size, "Batch size mismatch for additional tokens"
814
+
815
+ # Reshape to channel-last format for transformer processing
816
+ additional_tokens = additional_tokens.permute(0, 2, 1).contiguous() # (N, C, T) -> (N, T, C)
817
+
818
+ # Concatenate the additional tokens to the multi-set features
819
+ multi_set_features = torch.cat([multi_set_features, additional_tokens], dim=1)
820
+
821
+ # Project input features to the transformer dimension
822
+ multi_set_features = self.proj_embed(multi_set_features)
823
+
824
+ # Create dummy patch positions for each set
825
+ multi_set_positions = [None] * num_of_sets
826
+
827
+ # Add positional encoding for reference set (idx 0)
828
+ ref_set_pe = self.set_pos_table[0].clone().detach()
829
+ ref_set_pe = ref_set_pe.reshape((1, 1, self.dim))
830
+ ref_set_pe = ref_set_pe.repeat(batch_size, num_of_tokens_per_set[0], 1)
831
+ ref_set_features = multi_set_features[:, : num_of_tokens_per_set[0], :]
832
+ ref_set_features = ref_set_features + ref_set_pe
833
+
834
+ # Add positional encoding for non-reference sets (sequential indices starting from idx 1 or random indices which are uniformly sampled)
835
+ if self.use_rand_idx_pe_for_non_reference_sets:
836
+ non_ref_set_pe_indices = torch.randint(low=1, high=self.max_num_sets, size=(num_of_sets - 1,))
837
+ else:
838
+ non_ref_set_pe_indices = torch.arange(1, num_of_sets)
839
+ non_ref_set_pe_list = []
840
+ for non_ref_set_idx in range(1, num_of_sets):
841
+ non_ref_set_pe_for_idx = self.set_pos_table[non_ref_set_pe_indices[non_ref_set_idx - 1]].clone().detach()
842
+ non_ref_set_pe_for_idx = non_ref_set_pe_for_idx.reshape((1, 1, self.dim))
843
+ non_ref_set_pe_for_idx = non_ref_set_pe_for_idx.repeat(
844
+ batch_size, num_of_tokens_per_set[non_ref_set_idx], 1
845
+ )
846
+ non_ref_set_pe_list.append(non_ref_set_pe_for_idx)
847
+ non_ref_set_pe = torch.cat(non_ref_set_pe_list, dim=1)
848
+ non_ref_set_features = multi_set_features[:, num_of_tokens_per_set[0] : sum(num_of_tokens_per_set), :]
849
+ non_ref_set_features = non_ref_set_features + non_ref_set_pe
850
+
851
+ # Concatenate the reference and non-reference set features
852
+ # Handle additional tokens (no set-based positional encoding for them)
853
+ if model_input.additional_input_tokens is not None:
854
+ additional_features = multi_set_features[:, sum(num_of_tokens_per_set) :, :]
855
+ multi_set_features = torch.cat([ref_set_features, non_ref_set_features, additional_features], dim=1)
856
+ else:
857
+ multi_set_features = torch.cat([ref_set_features, non_ref_set_features], dim=1)
858
+
859
+ # Add None positions for additional tokens if they exist
860
+ if model_input.additional_input_tokens is not None:
861
+ additional_tokens_positions = [None] * model_input.additional_input_tokens.shape[2]
862
+ multi_set_positions = multi_set_positions + additional_tokens_positions
863
+
864
+ # Loop over the depth of the transformer
865
+ for depth_idx in range(self.depth):
866
+ # Apply the self-attention block and update the multi-set features
867
+ multi_set_features = self.self_attention_blocks[depth_idx](multi_set_features, multi_set_positions)
868
+
869
+ # Normalize the output features
870
+ output_multi_set_features = self.norm(multi_set_features)
871
+
872
+ # Extract additional token features if provided
873
+ additional_token_features = None
874
+ if model_input.additional_input_tokens is not None:
875
+ additional_token_features = output_multi_set_features[:, sum(num_of_tokens_per_set) :, :]
876
+ additional_token_features = additional_token_features.permute(
877
+ 0, 2, 1
878
+ ).contiguous() # (N, T, C) -> (N, C, T)
879
+ # Only keep the set features for reshaping
880
+ output_multi_set_features = output_multi_set_features[:, : sum(num_of_tokens_per_set), :]
881
+
882
+ # Reshape the output multi-set features from (N, T, C) to (N, C, T)
883
+ output_multi_set_features = output_multi_set_features.permute(0, 2, 1).contiguous()
884
+
885
+ # Split the output multi-set features into separate sets using the list of number of tokens per set
886
+ output_multi_set_features = torch.split(output_multi_set_features, num_of_tokens_per_set, dim=2)
887
+
888
+ # Return the output multi-set features with additional token features if provided
889
+ return MultiSetTransformerOutput(
890
+ features=output_multi_set_features, additional_token_features=additional_token_features
891
+ )
892
+
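
A sketch of the multi-set interface above with different token counts per set plus extra tokens; shapes follow the (N, C, T) convention asserted in `forward`, and the module path is assumed from the file location.

```python
import torch

from uniception.models.info_sharing.base import MultiSetTransformerInput
from uniception.models.info_sharing.global_attention_transformer import GlobalAttentionTransformer

model = GlobalAttentionTransformer(
    name="GAT",
    input_embed_dim=512,
    max_num_sets=4,
    use_rand_idx_pe_for_non_reference_sets=False,
)
sets = [torch.rand(1, 512, 196), torch.rand(1, 512, 64)]  # two sets, different token counts
extra = torch.rand(1, 512, 3)                             # e.g. pose + scale tokens
out = model(MultiSetTransformerInput(features=sets, additional_input_tokens=extra))
print([f.shape for f in out.features])                    # [(1, model.dim, 196), (1, model.dim, 64)]
print(out.additional_token_features.shape)                # (1, model.dim, 3)
```
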
893
+
894
+ def dummy_positional_encoding(x, xpos):
895
+ "Dummy function for positional encoding of tokens"
896
+ x = x
897
+ xpos = xpos
898
+ return x
899
+
900
+
901
+ if __name__ == "__main__":
902
+ # Init multi-view global-attention transformer with no custom positional encoding and run a forward pass
903
+ for num_views in [2, 3, 4]:
904
+ print(f"Testing MultiViewGlobalAttentionTransformer with {num_views} views ...")
905
+ # Sequential idx based positional encoding
906
+ model = MultiViewGlobalAttentionTransformer(
907
+ name="MV-GAT", input_embed_dim=1024, max_num_views=1000, use_rand_idx_pe_for_non_reference_views=False
908
+ )
909
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
910
+ model_input = MultiViewTransformerInput(features=model_input)
911
+ model_output = model(model_input)
912
+ assert len(model_output.features) == num_views
913
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
914
+ # Random idx based positional encoding
915
+ model = MultiViewGlobalAttentionTransformer(
916
+ name="MV-GAT", input_embed_dim=1024, max_num_views=1000, use_rand_idx_pe_for_non_reference_views=True
917
+ )
918
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
919
+ model_input = MultiViewTransformerInput(features=model_input)
920
+ model_output = model(model_input)
921
+ assert len(model_output.features) == num_views
922
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
923
+
924
+ # Init multi-view global-attention transformer with custom positional encoding and run a forward pass
925
+ for num_views in [2, 3, 4]:
926
+ print(f"Testing MultiViewGlobalAttentionTransformer with {num_views} views and custom positional encoding ...")
927
+ model = MultiViewGlobalAttentionTransformer(
928
+ name="MV-GAT",
929
+ input_embed_dim=1024,
930
+ max_num_views=1000,
931
+ use_rand_idx_pe_for_non_reference_views=True,
932
+ custom_positional_encoding=dummy_positional_encoding,
933
+ )
934
+ model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
935
+ model_input = MultiViewTransformerInput(features=model_input)
936
+ model_output = model(model_input)
937
+ assert len(model_output.features) == num_views
938
+ assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
939
+
940
+ print("All multi-view global-attention transformers initialized and tested successfully!")
941
+
+    # Intermediate Feature Returner Tests
+    print("Running Intermediate Feature Returner Tests ...")
+
+    # Run the intermediate feature returner with last-n index
+    model_intermediate_feature_returner = MultiViewGlobalAttentionTransformerIFR(
+        name="MV-GAT-IFR",
+        input_embed_dim=1024,
+        max_num_views=1000,
+        use_rand_idx_pe_for_non_reference_views=True,
+        indices=6,  # Last 6 layers
+    )
+    model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
+    model_input = MultiViewTransformerInput(features=model_input)
+    output = model_intermediate_feature_returner(model_input)
+    assert isinstance(output, tuple)
+    assert isinstance(output[0], MultiViewTransformerOutput)
+    assert len(output[1]) == 6
+    assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
+    assert len(output[1][0].features) == 2
+
+    # Run the intermediate feature returner with specific indices
+    model_intermediate_feature_returner = MultiViewGlobalAttentionTransformerIFR(
+        name="MV-GAT-IFR",
+        input_embed_dim=1024,
+        max_num_views=1000,
+        use_rand_idx_pe_for_non_reference_views=True,
+        indices=[0, 2, 4, 6],  # Specific indices
+    )
+    model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
+    model_input = MultiViewTransformerInput(features=model_input)
+    output = model_intermediate_feature_returner(model_input)
+    assert isinstance(output, tuple)
+    assert isinstance(output[0], MultiViewTransformerOutput)
+    assert len(output[1]) == 4
+    assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
+    assert len(output[1][0].features) == 2
+
+    # Test the normalizing of intermediate features
+    model_intermediate_feature_returner = MultiViewGlobalAttentionTransformerIFR(
+        name="MV-GAT-IFR",
+        input_embed_dim=1024,
+        max_num_views=1000,
+        use_rand_idx_pe_for_non_reference_views=True,
+        indices=[-1],  # Last layer
+        norm_intermediate=False,  # Disable normalization
+    )
+    model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
+    model_input = MultiViewTransformerInput(features=model_input)
+    output = model_intermediate_feature_returner(model_input)
+    for view_idx in range(2):
+        assert not torch.equal(
+            output[0].features[view_idx], output[1][-1].features[view_idx]
+        ), "Final features and intermediate features (last layer) must be different."
+
+    model_intermediate_feature_returner = MultiViewGlobalAttentionTransformerIFR(
+        name="MV-GAT-IFR",
+        input_embed_dim=1024,
+        max_num_views=1000,
+        use_rand_idx_pe_for_non_reference_views=True,
+        indices=[-1],  # Last layer
+        norm_intermediate=True,
+    )
+    model_input = [torch.rand(1, 1024, 14, 14) for _ in range(2)]
+    model_input = MultiViewTransformerInput(features=model_input)
+    output = model_intermediate_feature_returner(model_input)
+    for view_idx in range(2):
+        assert torch.equal(
+            output[0].features[view_idx], output[1][-1].features[view_idx]
+        ), "Final features and intermediate features (last layer) must be same."
+
+    print("All Intermediate Feature Returner Tests passed!")
+
+    # Init multi-set global-attention transformer and run a forward pass with different number of sets and set token sizes
+    import random
+
+    model = GlobalAttentionTransformer(
+        name="GAT", input_embed_dim=1024, max_num_sets=3, use_rand_idx_pe_for_non_reference_sets=False
+    )
+    for num_sets in [2, 3]:
+        print(f"Testing GlobalAttentionTransformer with {num_sets} sets ...")
+        model_input = [torch.rand(1, 1024, random.randint(256, 513)) for _ in range(num_sets)]
+        model_input = MultiSetTransformerInput(features=model_input)
+        model_output = model(model_input)
+        assert len(model_output.features) == num_sets
+        for feat, rand_input in zip(model_output.features, model_input.features):
+            assert feat.shape[2] == rand_input.shape[2]
+            assert feat.shape[1] == model.dim
+            assert feat.shape[0] == rand_input.shape[0]
+    # Random idx based positional encoding
+    model = GlobalAttentionTransformer(
+        name="GAT", input_embed_dim=1024, max_num_sets=1000, use_rand_idx_pe_for_non_reference_sets=True
+    )
+    for num_sets in [2, 3, 4]:
+        print(f"Testing GlobalAttentionTransformer with {num_sets} sets ...")
+        model_input = [torch.rand(1, 1024, random.randint(256, 513)) for _ in range(num_sets)]
+        model_input = MultiSetTransformerInput(features=model_input)
+        model_output = model(model_input)
+        assert len(model_output.features) == num_sets
+        for feat, rand_input in zip(model_output.features, model_input.features):
+            assert feat.shape[2] == rand_input.shape[2]
+            assert feat.shape[1] == model.dim
+            assert feat.shape[0] == rand_input.shape[0]
+
+    print("All Global Attention Transformer Tests passed!")
+
+    # Test additional input tokens for MultiViewGlobalAttentionTransformer
+    print("Testing MultiViewGlobalAttentionTransformer with additional input tokens...")
+    model = MultiViewGlobalAttentionTransformer(
+        name="MV-GAT", input_embed_dim=1024, max_num_views=1000, use_rand_idx_pe_for_non_reference_views=False
+    )
+    num_views = 2
+    num_additional_tokens = 5
+    model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
+    additional_tokens = torch.rand(1, 1024, num_additional_tokens)
+    model_input = MultiViewTransformerInput(features=model_input, additional_input_tokens=additional_tokens)
+    model_output = model(model_input)
+    assert len(model_output.features) == num_views
+    assert all(f.shape == (1, model.dim, 14, 14) for f in model_output.features)
+    assert model_output.additional_token_features is not None
+    assert model_output.additional_token_features.shape == (1, model.dim, num_additional_tokens)
+
+    # Test additional input tokens for MultiViewGlobalAttentionTransformerIFR
+    print("Testing MultiViewGlobalAttentionTransformerIFR with additional input tokens...")
+    model_ifr = MultiViewGlobalAttentionTransformerIFR(
+        name="MV-GAT-IFR",
+        input_embed_dim=1024,
+        max_num_views=1000,
+        use_rand_idx_pe_for_non_reference_views=True,
+        indices=[0, 2, 4],
+    )
+    model_input = [torch.rand(1, 1024, 14, 14) for _ in range(num_views)]
+    additional_tokens = torch.rand(1, 1024, num_additional_tokens)
+    model_input = MultiViewTransformerInput(features=model_input, additional_input_tokens=additional_tokens)
+    output = model_ifr(model_input)
+    assert isinstance(output, tuple)
+    assert isinstance(output[0], MultiViewTransformerOutput)
+    assert output[0].additional_token_features is not None
+    assert output[0].additional_token_features.shape == (1, model_ifr.dim, num_additional_tokens)
+    assert len(output[1]) == 3
+    assert all(isinstance(intermediate, MultiViewTransformerOutput) for intermediate in output[1])
+    assert all(intermediate.additional_token_features is not None for intermediate in output[1])
+    assert all(
+        intermediate.additional_token_features.shape == (1, model_ifr.dim, num_additional_tokens)
+        for intermediate in output[1]
+    )
+
+    # Test additional input tokens for GlobalAttentionTransformer
+    print("Testing GlobalAttentionTransformer with additional input tokens...")
+    model = GlobalAttentionTransformer(
+        name="GAT", input_embed_dim=1024, max_num_sets=1000, use_rand_idx_pe_for_non_reference_sets=False
+    )
+    num_sets = 3
+    num_additional_tokens = 8
+    model_input = [torch.rand(1, 1024, random.randint(256, 513)) for _ in range(num_sets)]
+    additional_tokens = torch.rand(1, 1024, num_additional_tokens)
+    model_input = MultiSetTransformerInput(features=model_input, additional_input_tokens=additional_tokens)
+    model_output = model(model_input)
+    assert len(model_output.features) == num_sets
+    for feat, rand_input in zip(model_output.features, model_input.features):
+        assert feat.shape[2] == rand_input.shape[2]
+        assert feat.shape[1] == model.dim
+        assert feat.shape[0] == rand_input.shape[0]
+    assert model_output.additional_token_features is not None
+    assert model_output.additional_token_features.shape == (1, model.dim, num_additional_tokens)
+
+    print("All tests using additional input tokens passed!")
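# Not part of the diff above -- a short usage sketch of the tuple returned by the
# intermediate-feature-returner variants exercised in the tests: element 0 is the final
# MultiViewTransformerOutput and element 1 holds one MultiViewTransformerOutput per requested
# layer. It reuses model_ifr and model_input exactly as constructed in the tests above; the
# variable names introduced here are hypothetical.
final_output, intermediate_outputs = model_ifr(model_input)
final_view_features = final_output.features  # one (B, model_ifr.dim, H, W) tensor per view
per_layer_view0_features = [layer.features[0] for layer in intermediate_outputs]  # view 0 at indices [0, 2, 4]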
UniCeption/uniception/models/libs/__init__.py ADDED
File without changes