Ruicheng committed on
Commit 201ab98 · 0 parent(s)

Initial commit for HF

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +37 -0
  2. .gitignore +423 -0
  3. CHANGELOG.md +32 -0
  4. CODE_OF_CONDUCT.md +9 -0
  5. LICENSE +224 -0
  6. README.md +14 -0
  7. SECURITY.md +41 -0
  8. SUPPORT.md +25 -0
  9. app.py +298 -0
  10. assets/overview_simplified.png +3 -0
  11. assets/panorama_pipeline.png +3 -0
  12. baselines/da_v2.py +88 -0
  13. baselines/da_v2_metric.py +99 -0
  14. baselines/metric3d_v2.py +117 -0
  15. baselines/moge.py +83 -0
  16. configs/eval/all_benchmarks.json +78 -0
  17. configs/eval/benchmarks/ddad.json +9 -0
  18. configs/eval/benchmarks/diode.json +9 -0
  19. configs/eval/benchmarks/eth3d.json +10 -0
  20. configs/eval/benchmarks/gso.json +8 -0
  21. configs/eval/benchmarks/hammer.json +10 -0
  22. configs/eval/benchmarks/ibims-1.json +10 -0
  23. configs/eval/benchmarks/kitti.json +9 -0
  24. configs/eval/benchmarks/nyu.json +8 -0
  25. configs/eval/benchmarks/sintel.json +10 -0
  26. configs/eval/benchmarks/spring.json +9 -0
  27. configs/train/v1.json +77 -0
  28. docs/eval.md +77 -0
  29. docs/train.md +181 -0
  30. example_images/01_HouseIndoor.jpg +3 -0
  31. example_images/02_Office.jpg +3 -0
  32. example_images/03_Traffic.jpg +3 -0
  33. example_images/04_BunnyCake.jpg +3 -0
  34. example_images/05_Mountain.jpg +3 -0
  35. example_images/06_MaitreyaBuddha.png +3 -0
  36. example_images/07_Breads.jpg +3 -0
  37. example_images/08_CatGirl.png +3 -0
  38. example_images/09_Restaurant.jpg +3 -0
  39. example_images/10_MedievalVillage.jpg +3 -0
  40. example_images/11_Room.jpg +3 -0
  41. example_images/12_StylizedHouses.jpg +3 -0
  42. example_images/panorama/Braunschweig_Panoram.jpg +3 -0
  43. moge/__init__.py +0 -0
  44. moge/model/__init__.py +18 -0
  45. moge/model/dinov2/__init__.py +6 -0
  46. moge/model/dinov2/hub/__init__.py +4 -0
  47. moge/model/dinov2/hub/backbones.py +156 -0
  48. moge/model/dinov2/hub/utils.py +39 -0
  49. moge/model/dinov2/layers/__init__.py +11 -0
  50. moge/model/dinov2/layers/attention.py +89 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,423 @@
1
+ ## Ignore Visual Studio temporary files, build results, and
2
+ ## files generated by popular Visual Studio add-ons.
3
+ ##
4
+ ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5
+
6
+ # User-specific files
7
+ *.rsuser
8
+ *.suo
9
+ *.user
10
+ *.userosscache
11
+ *.sln.docstates
12
+
13
+ # User-specific files (MonoDevelop/Xamarin Studio)
14
+ *.userprefs
15
+
16
+ # Mono auto generated files
17
+ mono_crash.*
18
+
19
+ # Build results
20
+ [Dd]ebug/
21
+ [Dd]ebugPublic/
22
+ [Rr]elease/
23
+ [Rr]eleases/
24
+ x64/
25
+ x86/
26
+ [Ww][Ii][Nn]32/
27
+ [Aa][Rr][Mm]/
28
+ [Aa][Rr][Mm]64/
29
+ bld/
30
+ [Bb]in/
31
+ [Oo]bj/
32
+ [Ll]og/
33
+ [Ll]ogs/
34
+
35
+ # Visual Studio 2015/2017 cache/options directory
36
+ .vs/
37
+ # Uncomment if you have tasks that create the project's static files in wwwroot
38
+ #wwwroot/
39
+
40
+ # Visual Studio 2017 auto generated files
41
+ Generated\ Files/
42
+
43
+ # MSTest test Results
44
+ [Tt]est[Rr]esult*/
45
+ [Bb]uild[Ll]og.*
46
+
47
+ # NUnit
48
+ *.VisualState.xml
49
+ TestResult.xml
50
+ nunit-*.xml
51
+
52
+ # Build Results of an ATL Project
53
+ [Dd]ebugPS/
54
+ [Rr]eleasePS/
55
+ dlldata.c
56
+
57
+ # Benchmark Results
58
+ BenchmarkDotNet.Artifacts/
59
+
60
+ # .NET Core
61
+ project.lock.json
62
+ project.fragment.lock.json
63
+ artifacts/
64
+
65
+ # ASP.NET Scaffolding
66
+ ScaffoldingReadMe.txt
67
+
68
+ # StyleCop
69
+ StyleCopReport.xml
70
+
71
+ # Files built by Visual Studio
72
+ *_i.c
73
+ *_p.c
74
+ *_h.h
75
+ *.ilk
76
+ *.meta
77
+ *.obj
78
+ *.iobj
79
+ *.pch
80
+ *.pdb
81
+ *.ipdb
82
+ *.pgc
83
+ *.pgd
84
+ *.rsp
85
+ *.sbr
86
+ *.tlb
87
+ *.tli
88
+ *.tlh
89
+ *.tmp
90
+ *.tmp_proj
91
+ *_wpftmp.csproj
92
+ *.log
93
+ *.tlog
94
+ *.vspscc
95
+ *.vssscc
96
+ .builds
97
+ *.pidb
98
+ *.svclog
99
+ *.scc
100
+
101
+ # Chutzpah Test files
102
+ _Chutzpah*
103
+
104
+ # Visual C++ cache files
105
+ ipch/
106
+ *.aps
107
+ *.ncb
108
+ *.opendb
109
+ *.opensdf
110
+ *.sdf
111
+ *.cachefile
112
+ *.VC.db
113
+ *.VC.VC.opendb
114
+
115
+ # Visual Studio profiler
116
+ *.psess
117
+ *.vsp
118
+ *.vspx
119
+ *.sap
120
+
121
+ # Visual Studio Trace Files
122
+ *.e2e
123
+
124
+ # TFS 2012 Local Workspace
125
+ $tf/
126
+
127
+ # Guidance Automation Toolkit
128
+ *.gpState
129
+
130
+ # ReSharper is a .NET coding add-in
131
+ _ReSharper*/
132
+ *.[Rr]e[Ss]harper
133
+ *.DotSettings.user
134
+
135
+ # TeamCity is a build add-in
136
+ _TeamCity*
137
+
138
+ # DotCover is a Code Coverage Tool
139
+ *.dotCover
140
+
141
+ # AxoCover is a Code Coverage Tool
142
+ .axoCover/*
143
+ !.axoCover/settings.json
144
+
145
+ # Coverlet is a free, cross platform Code Coverage Tool
146
+ coverage*.json
147
+ coverage*.xml
148
+ coverage*.info
149
+
150
+ # Visual Studio code coverage results
151
+ *.coverage
152
+ *.coveragexml
153
+
154
+ # NCrunch
155
+ _NCrunch_*
156
+ .*crunch*.local.xml
157
+ nCrunchTemp_*
158
+
159
+ # MightyMoose
160
+ *.mm.*
161
+ AutoTest.Net/
162
+
163
+ # Web workbench (sass)
164
+ .sass-cache/
165
+
166
+ # Installshield output folder
167
+ [Ee]xpress/
168
+
169
+ # DocProject is a documentation generator add-in
170
+ DocProject/buildhelp/
171
+ DocProject/Help/*.HxT
172
+ DocProject/Help/*.HxC
173
+ DocProject/Help/*.hhc
174
+ DocProject/Help/*.hhk
175
+ DocProject/Help/*.hhp
176
+ DocProject/Help/Html2
177
+ DocProject/Help/html
178
+
179
+ # Click-Once directory
180
+ publish/
181
+
182
+ # Publish Web Output
183
+ *.[Pp]ublish.xml
184
+ *.azurePubxml
185
+ # Note: Comment the next line if you want to checkin your web deploy settings,
186
+ # but database connection strings (with potential passwords) will be unencrypted
187
+ *.pubxml
188
+ *.publishproj
189
+
190
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
191
+ # checkin your Azure Web App publish settings, but sensitive information contained
192
+ # in these scripts will be unencrypted
193
+ PublishScripts/
194
+
195
+ # NuGet Packages
196
+ *.nupkg
197
+ # NuGet Symbol Packages
198
+ *.snupkg
199
+ # The packages folder can be ignored because of Package Restore
200
+ **/[Pp]ackages/*
201
+ # except build/, which is used as an MSBuild target.
202
+ !**/[Pp]ackages/build/
203
+ # Uncomment if necessary however generally it will be regenerated when needed
204
+ #!**/[Pp]ackages/repositories.config
205
+ # NuGet v3's project.json files produces more ignorable files
206
+ *.nuget.props
207
+ *.nuget.targets
208
+
209
+ # Microsoft Azure Build Output
210
+ csx/
211
+ *.build.csdef
212
+
213
+ # Microsoft Azure Emulator
214
+ ecf/
215
+ rcf/
216
+
217
+ # Windows Store app package directories and files
218
+ AppPackages/
219
+ BundleArtifacts/
220
+ Package.StoreAssociation.xml
221
+ _pkginfo.txt
222
+ *.appx
223
+ *.appxbundle
224
+ *.appxupload
225
+
226
+ # Visual Studio cache files
227
+ # files ending in .cache can be ignored
228
+ *.[Cc]ache
229
+ # but keep track of directories ending in .cache
230
+ !?*.[Cc]ache/
231
+
232
+ # Others
233
+ ClientBin/
234
+ ~$*
235
+ *~
236
+ *.dbmdl
237
+ *.dbproj.schemaview
238
+ *.jfm
239
+ *.pfx
240
+ *.publishsettings
241
+ orleans.codegen.cs
242
+
243
+ # Including strong name files can present a security risk
244
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245
+ #*.snk
246
+
247
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
248
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249
+ #bower_components/
250
+
251
+ # RIA/Silverlight projects
252
+ Generated_Code/
253
+
254
+ # Backup & report files from converting an old project file
255
+ # to a newer Visual Studio version. Backup files are not needed,
256
+ # because we have git ;-)
257
+ _UpgradeReport_Files/
258
+ Backup*/
259
+ UpgradeLog*.XML
260
+ UpgradeLog*.htm
261
+ ServiceFabricBackup/
262
+ *.rptproj.bak
263
+
264
+ # SQL Server files
265
+ *.mdf
266
+ *.ldf
267
+ *.ndf
268
+
269
+ # Business Intelligence projects
270
+ *.rdl.data
271
+ *.bim.layout
272
+ *.bim_*.settings
273
+ *.rptproj.rsuser
274
+ *- [Bb]ackup.rdl
275
+ *- [Bb]ackup ([0-9]).rdl
276
+ *- [Bb]ackup ([0-9][0-9]).rdl
277
+
278
+ # Microsoft Fakes
279
+ FakesAssemblies/
280
+
281
+ # GhostDoc plugin setting file
282
+ *.GhostDoc.xml
283
+
284
+ # Node.js Tools for Visual Studio
285
+ .ntvs_analysis.dat
286
+ node_modules/
287
+
288
+ # Visual Studio 6 build log
289
+ *.plg
290
+
291
+ # Visual Studio 6 workspace options file
292
+ *.opt
293
+
294
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295
+ *.vbw
296
+
297
+ # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298
+ *.vbp
299
+
300
+ # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301
+ *.dsw
302
+ *.dsp
303
+
304
+ # Visual Studio 6 technical files
305
+ *.ncb
306
+ *.aps
307
+
308
+ # Visual Studio LightSwitch build output
309
+ **/*.HTMLClient/GeneratedArtifacts
310
+ **/*.DesktopClient/GeneratedArtifacts
311
+ **/*.DesktopClient/ModelManifest.xml
312
+ **/*.Server/GeneratedArtifacts
313
+ **/*.Server/ModelManifest.xml
314
+ _Pvt_Extensions
315
+
316
+ # Paket dependency manager
317
+ .paket/paket.exe
318
+ paket-files/
319
+
320
+ # FAKE - F# Make
321
+ .fake/
322
+
323
+ # CodeRush personal settings
324
+ .cr/personal
325
+
326
+ # Python Tools for Visual Studio (PTVS)
327
+ __pycache__/
328
+ *.pyc
329
+
330
+ # Cake - Uncomment if you are using it
331
+ # tools/**
332
+ # !tools/packages.config
333
+
334
+ # Tabs Studio
335
+ *.tss
336
+
337
+ # Telerik's JustMock configuration file
338
+ *.jmconfig
339
+
340
+ # BizTalk build output
341
+ *.btp.cs
342
+ *.btm.cs
343
+ *.odx.cs
344
+ *.xsd.cs
345
+
346
+ # OpenCover UI analysis results
347
+ OpenCover/
348
+
349
+ # Azure Stream Analytics local run output
350
+ ASALocalRun/
351
+
352
+ # MSBuild Binary and Structured Log
353
+ *.binlog
354
+
355
+ # NVidia Nsight GPU debugger configuration file
356
+ *.nvuser
357
+
358
+ # MFractors (Xamarin productivity tool) working folder
359
+ .mfractor/
360
+
361
+ # Local History for Visual Studio
362
+ .localhistory/
363
+
364
+ # Visual Studio History (VSHistory) files
365
+ .vshistory/
366
+
367
+ # BeatPulse healthcheck temp database
368
+ healthchecksdb
369
+
370
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
371
+ MigrationBackup/
372
+
373
+ # Ionide (cross platform F# VS Code tools) working folder
374
+ .ionide/
375
+
376
+ # Fody - auto-generated XML schema
377
+ FodyWeavers.xsd
378
+
379
+ # VS Code files for those working on multiple tools
380
+ .vscode/*
381
+ !.vscode/settings.json
382
+ !.vscode/tasks.json
383
+ !.vscode/launch.json
384
+ !.vscode/extensions.json
385
+ *.code-workspace
386
+
387
+ # Local History for Visual Studio Code
388
+ .history/
389
+
390
+ # Windows Installer files from build outputs
391
+ *.cab
392
+ *.msi
393
+ *.msix
394
+ *.msm
395
+ *.msp
396
+
397
+ # JetBrains Rider
398
+ *.sln.iml
399
+
400
+ # Python
401
+ *.egg-info/
402
+ /build
403
+
404
+ # MoGe
405
+ /data*
406
+ /download
407
+ /extract
408
+ /debug
409
+ /workspace
410
+ /mlruns
411
+ /infer_output
412
+ /video_output
413
+ /eval_output
414
+ /.blobcache
415
+ /test_images
416
+ /test_videos
417
+ /vis
418
+ /videos
419
+ /blobmnt
420
+ /eval_dump
421
+ /pretrained
422
+ /.gradio
423
+ /tmp
CHANGELOG.md ADDED
@@ -0,0 +1,32 @@
+ ## 2024-11-28
+ ### Added
+ - Supported user-provided camera FOV. See the `--fov_x` option of [scripts/infer.py](scripts/infer.py).
+   - Related issues: [#25](https://github.com/microsoft/MoGe/issues/25) and [#24](https://github.com/microsoft/MoGe/issues/24).
+ - Added an inference script for panorama images. See [scripts/infer_panorama.py](scripts/infer_panorama.py).
+   - Related issue: [#19](https://github.com/microsoft/MoGe/issues/19).
+
+ ### Fixed
+ - Suppressed unnecessary numpy runtime warnings.
+ - Specified recommended versions of requirements.
+   - Related issue: [#21](https://github.com/microsoft/MoGe/issues/21).
+
+ ### Changed
+ - Moved `app.py` and `infer.py` to [scripts/](scripts/).
+ - Improved edge removal.
+
+ ## 2025-03-18
+ ### Added
+ - Training and evaluation code. See [docs/train.md](docs/train.md) and [docs/eval.md](docs/eval.md).
+ - Supported installation via pip. Thanks to @fabiencastan and @jgoueslard for their commits in [#47](https://github.com/microsoft/MoGe/pull/47).
+ - Supported command-line usage when installed.
+
+ ### Changed
+ - Moved `scripts/` into `moge/` for package installation and command-line usage.
+ - Renamed `moge.model.moge_model` to `moge.model.v1` for version management.
+   You can now import the model class via `from moge.model.v1 import MoGeModel` or `from moge.model import import_model_class_by_version; MoGeModel = import_model_class_by_version('v1')`.
+ - Exposed the `num_tokens` parameter of the MoGe model.
+
+ ## 2025-06-10
+ ### Added
+ - Released MoGe-2.
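The 2025-03-18 entry quotes two ways to import the model class after the rename. Below is a minimal sketch of both, assuming the package is pip-installed; the checkpoint name `Ruicheng/moge-vitl` is the v1 default listed in this commit's `app.py`, not something stated in the changelog itself.

```python
import torch

# Style 1 from the changelog: import the versioned class directly.
from moge.model.v1 import MoGeModel

# Style 2 from the changelog: resolve the class from a version string.
from moge.model import import_model_class_by_version
MoGeModelV1 = import_model_class_by_version('v1')  # expected to resolve to the same class

# Loading pretrained weights; "Ruicheng/moge-vitl" is taken from app.py's v1 default (assumption).
model = MoGeModel.from_pretrained('Ruicheng/moge-vitl').eval()
if torch.cuda.is_available():
    model = model.cuda()
```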
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
+ # Microsoft Open Source Code of Conduct
+
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+
+ Resources:
+
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+ - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
LICENSE ADDED
@@ -0,0 +1,224 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
22
+
23
+
24
+ Apache License
25
+ Version 2.0, January 2004
26
+ http://www.apache.org/licenses/
27
+
28
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
29
+
30
+ 1. Definitions.
31
+
32
+ "License" shall mean the terms and conditions for use, reproduction,
33
+ and distribution as defined by Sections 1 through 9 of this document.
34
+
35
+ "Licensor" shall mean the copyright owner or entity authorized by
36
+ the copyright owner that is granting the License.
37
+
38
+ "Legal Entity" shall mean the union of the acting entity and all
39
+ other entities that control, are controlled by, or are under common
40
+ control with that entity. For the purposes of this definition,
41
+ "control" means (i) the power, direct or indirect, to cause the
42
+ direction or management of such entity, whether by contract or
43
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
44
+ outstanding shares, or (iii) beneficial ownership of such entity.
45
+
46
+ "You" (or "Your") shall mean an individual or Legal Entity
47
+ exercising permissions granted by this License.
48
+
49
+ "Source" form shall mean the preferred form for making modifications,
50
+ including but not limited to software source code, documentation
51
+ source, and configuration files.
52
+
53
+ "Object" form shall mean any form resulting from mechanical
54
+ transformation or translation of a Source form, including but
55
+ not limited to compiled object code, generated documentation,
56
+ and conversions to other media types.
57
+
58
+ "Work" shall mean the work of authorship, whether in Source or
59
+ Object form, made available under the License, as indicated by a
60
+ copyright notice that is included in or attached to the work
61
+ (an example is provided in the Appendix below).
62
+
63
+ "Derivative Works" shall mean any work, whether in Source or Object
64
+ form, that is based on (or derived from) the Work and for which the
65
+ editorial revisions, annotations, elaborations, or other modifications
66
+ represent, as a whole, an original work of authorship. For the purposes
67
+ of this License, Derivative Works shall not include works that remain
68
+ separable from, or merely link (or bind by name) to the interfaces of,
69
+ the Work and Derivative Works thereof.
70
+
71
+ "Contribution" shall mean any work of authorship, including
72
+ the original version of the Work and any modifications or additions
73
+ to that Work or Derivative Works thereof, that is intentionally
74
+ submitted to Licensor for inclusion in the Work by the copyright owner
75
+ or by an individual or Legal Entity authorized to submit on behalf of
76
+ the copyright owner. For the purposes of this definition, "submitted"
77
+ means any form of electronic, verbal, or written communication sent
78
+ to the Licensor or its representatives, including but not limited to
79
+ communication on electronic mailing lists, source code control systems,
80
+ and issue tracking systems that are managed by, or on behalf of, the
81
+ Licensor for the purpose of discussing and improving the Work, but
82
+ excluding communication that is conspicuously marked or otherwise
83
+ designated in writing by the copyright owner as "Not a Contribution."
84
+
85
+ "Contributor" shall mean Licensor and any individual or Legal Entity
86
+ on behalf of whom a Contribution has been received by Licensor and
87
+ subsequently incorporated within the Work.
88
+
89
+ 2. Grant of Copyright License. Subject to the terms and conditions of
90
+ this License, each Contributor hereby grants to You a perpetual,
91
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
92
+ copyright license to reproduce, prepare Derivative Works of,
93
+ publicly display, publicly perform, sublicense, and distribute the
94
+ Work and such Derivative Works in Source or Object form.
95
+
96
+ 3. Grant of Patent License. Subject to the terms and conditions of
97
+ this License, each Contributor hereby grants to You a perpetual,
98
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
99
+ (except as stated in this section) patent license to make, have made,
100
+ use, offer to sell, sell, import, and otherwise transfer the Work,
101
+ where such license applies only to those patent claims licensable
102
+ by such Contributor that are necessarily infringed by their
103
+ Contribution(s) alone or by combination of their Contribution(s)
104
+ with the Work to which such Contribution(s) was submitted. If You
105
+ institute patent litigation against any entity (including a
106
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
107
+ or a Contribution incorporated within the Work constitutes direct
108
+ or contributory patent infringement, then any patent licenses
109
+ granted to You under this License for that Work shall terminate
110
+ as of the date such litigation is filed.
111
+
112
+ 4. Redistribution. You may reproduce and distribute copies of the
113
+ Work or Derivative Works thereof in any medium, with or without
114
+ modifications, and in Source or Object form, provided that You
115
+ meet the following conditions:
116
+
117
+ (a) You must give any other recipients of the Work or
118
+ Derivative Works a copy of this License; and
119
+
120
+ (b) You must cause any modified files to carry prominent notices
121
+ stating that You changed the files; and
122
+
123
+ (c) You must retain, in the Source form of any Derivative Works
124
+ that You distribute, all copyright, patent, trademark, and
125
+ attribution notices from the Source form of the Work,
126
+ excluding those notices that do not pertain to any part of
127
+ the Derivative Works; and
128
+
129
+ (d) If the Work includes a "NOTICE" text file as part of its
130
+ distribution, then any Derivative Works that You distribute must
131
+ include a readable copy of the attribution notices contained
132
+ within such NOTICE file, excluding those notices that do not
133
+ pertain to any part of the Derivative Works, in at least one
134
+ of the following places: within a NOTICE text file distributed
135
+ as part of the Derivative Works; within the Source form or
136
+ documentation, if provided along with the Derivative Works; or,
137
+ within a display generated by the Derivative Works, if and
138
+ wherever such third-party notices normally appear. The contents
139
+ of the NOTICE file are for informational purposes only and
140
+ do not modify the License. You may add Your own attribution
141
+ notices within Derivative Works that You distribute, alongside
142
+ or as an addendum to the NOTICE text from the Work, provided
143
+ that such additional attribution notices cannot be construed
144
+ as modifying the License.
145
+
146
+ You may add Your own copyright statement to Your modifications and
147
+ may provide additional or different license terms and conditions
148
+ for use, reproduction, or distribution of Your modifications, or
149
+ for any such Derivative Works as a whole, provided Your use,
150
+ reproduction, and distribution of the Work otherwise complies with
151
+ the conditions stated in this License.
152
+
153
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
154
+ any Contribution intentionally submitted for inclusion in the Work
155
+ by You to the Licensor shall be under the terms and conditions of
156
+ this License, without any additional terms or conditions.
157
+ Notwithstanding the above, nothing herein shall supersede or modify
158
+ the terms of any separate license agreement you may have executed
159
+ with Licensor regarding such Contributions.
160
+
161
+ 6. Trademarks. This License does not grant permission to use the trade
162
+ names, trademarks, service marks, or product names of the Licensor,
163
+ except as required for reasonable and customary use in describing the
164
+ origin of the Work and reproducing the content of the NOTICE file.
165
+
166
+ 7. Disclaimer of Warranty. Unless required by applicable law or
167
+ agreed to in writing, Licensor provides the Work (and each
168
+ Contributor provides its Contributions) on an "AS IS" BASIS,
169
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
170
+ implied, including, without limitation, any warranties or conditions
171
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
172
+ PARTICULAR PURPOSE. You are solely responsible for determining the
173
+ appropriateness of using or redistributing the Work and assume any
174
+ risks associated with Your exercise of permissions under this License.
175
+
176
+ 8. Limitation of Liability. In no event and under no legal theory,
177
+ whether in tort (including negligence), contract, or otherwise,
178
+ unless required by applicable law (such as deliberate and grossly
179
+ negligent acts) or agreed to in writing, shall any Contributor be
180
+ liable to You for damages, including any direct, indirect, special,
181
+ incidental, or consequential damages of any character arising as a
182
+ result of this License or out of the use or inability to use the
183
+ Work (including but not limited to damages for loss of goodwill,
184
+ work stoppage, computer failure or malfunction, or any and all
185
+ other commercial damages or losses), even if such Contributor
186
+ has been advised of the possibility of such damages.
187
+
188
+ 9. Accepting Warranty or Additional Liability. While redistributing
189
+ the Work or Derivative Works thereof, You may choose to offer,
190
+ and charge a fee for, acceptance of support, warranty, indemnity,
191
+ or other liability obligations and/or rights consistent with this
192
+ License. However, in accepting such obligations, You may act only
193
+ on Your own behalf and on Your sole responsibility, not on behalf
194
+ of any other Contributor, and only if You agree to indemnify,
195
+ defend, and hold each Contributor harmless for any liability
196
+ incurred by, or claims asserted against, such Contributor by reason
197
+ of your accepting any such warranty or additional liability.
198
+
199
+ END OF TERMS AND CONDITIONS
200
+
201
+ APPENDIX: How to apply the Apache License to your work.
202
+
203
+ To apply the Apache License to your work, attach the following
204
+ boilerplate notice, with the fields enclosed by brackets "[]"
205
+ replaced with your own identifying information. (Don't include
206
+ the brackets!) The text should be enclosed in the appropriate
207
+ comment syntax for the file format. We also recommend that a
208
+ file or class name and description of purpose be included on the
209
+ same "printed page" as the copyright notice for easier
210
+ identification within third-party archives.
211
+
212
+ Copyright [yyyy] [name of copyright owner]
213
+
214
+ Licensed under the Apache License, Version 2.0 (the "License");
215
+ you may not use this file except in compliance with the License.
216
+ You may obtain a copy of the License at
217
+
218
+ http://www.apache.org/licenses/LICENSE-2.0
219
+
220
+ Unless required by applicable law or agreed to in writing, software
221
+ distributed under the License is distributed on an "AS IS" BASIS,
222
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
223
+ See the License for the specific language governing permissions and
224
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: MoGe 2
+ emoji: 🚀
+ colorFrom: indigo
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.33.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: Monocular metric-scale geometry estimation
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
SECURITY.md ADDED
@@ -0,0 +1,41 @@
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
+
+ ## Security
+
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+
+ ## Reporting Security Issues
+
+ **Please do not report security vulnerabilities through public GitHub issues.**
+
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
+
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+ * Full paths of source file(s) related to the manifestation of the issue
+ * The location of the affected source code (tag/branch/commit or direct URL)
+ * Any special configuration required to reproduce the issue
+ * Step-by-step instructions to reproduce the issue
+ * Proof-of-concept or exploit code (if possible)
+ * Impact of the issue, including how an attacker might exploit the issue
+
+ This information will help us triage your report more quickly.
+
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+
+ ## Preferred Languages
+
+ We prefer all communications to be in English.
+
+ ## Policy
+
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
SUPPORT.md ADDED
@@ -0,0 +1,25 @@
+ # TODO: The maintainer of this repo has not yet edited this file
+
+ **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
+
+ - **No CSS support:** Fill out this template with information about how to file issues and get help.
+ - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
+ - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
+
+ *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
+
+ # Support
+
+ ## How to file issues and get help
+
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
+ issues before filing new issues to avoid duplicates. For new issues, file your bug or
+ feature request as a new Issue.
+
+ For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
+ FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
+ CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
+
+ ## Microsoft Support Policy
+
+ Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
app.py ADDED
@@ -0,0 +1,298 @@
+ import os
+ os.environ['OPENCV_IO_ENABLE_OPENEXR'] = '1'
+ import sys
+ from pathlib import Path
+ import time
+ import uuid
+ import tempfile
+ import itertools
+ from typing import *
+ import atexit
+ from concurrent.futures import ThreadPoolExecutor
+ import shutil
+
+ import click
+
+
+ @click.command(help='Web demo')
+ @click.option('--share', is_flag=True, help='Whether to run the app in shared mode.')
+ @click.option('--pretrained', 'pretrained_model_name_or_path', default=None, help='The name or path of the pre-trained model.')
+ @click.option('--version', 'model_version', default='v2', help='The version of the model.')
+ def main(share: bool, pretrained_model_name_or_path: str, model_version: str, use_fp16: bool = True):
+     print("Import modules...")
+     # Lazy import
+     import cv2
+     import torch
+     import numpy as np
+     import trimesh
+     import trimesh.visual
+     from PIL import Image
+     import gradio as gr
+     try:
+         import spaces   # This is for deployment at huggingface.co/spaces
+         HUGGINGFACE_SPACES_INSTALLED = True
+     except ImportError:
+         HUGGINGFACE_SPACES_INSTALLED = False
+
+     import utils3d
+     from moge.utils.io import write_normal
+     from moge.utils.vis import colorize_depth, colorize_normal
+     from moge.model import import_model_class_by_version
+     from moge.utils.geometry_numpy import depth_occlusion_edge_numpy
+     from moge.utils.tools import timeit
+
+     print("Load model...")
+     if pretrained_model_name_or_path is None:
+         DEFAULT_PRETRAINED_MODEL_FOR_EACH_VERSION = {
+             "v1": "Ruicheng/moge-vitl",
+             "v2": "Ruicheng/moge-2-vitl-normal",
+         }
+         pretrained_model_name_or_path = DEFAULT_PRETRAINED_MODEL_FOR_EACH_VERSION[model_version]
+     model = import_model_class_by_version(model_version).from_pretrained(pretrained_model_name_or_path).cuda().eval()
+     if use_fp16:
+         model.half()
+     thread_pool_executor = ThreadPoolExecutor(max_workers=1)
+
+     # Remove generated files after a delay so temporary outputs do not accumulate.
+     def delete_later(path: Union[str, os.PathLike], delay: int = 300):
+         def _delete():
+             try:
+                 os.remove(path)
+             except FileNotFoundError:
+                 pass
+         def _wait_and_delete():
+             time.sleep(delay)
+             _delete()
+         thread_pool_executor.submit(_wait_and_delete)
+         atexit.register(_delete)
+
+     # Inference on GPU.
+     @(spaces.GPU if HUGGINGFACE_SPACES_INSTALLED else lambda x: x)
+     def run_with_gpu(image: np.ndarray, resolution_level: int, apply_mask: bool) -> Dict[str, np.ndarray]:
+         image_tensor = torch.tensor(image, dtype=torch.float32 if not use_fp16 else torch.float16, device=torch.device('cuda')).permute(2, 0, 1) / 255
+         output = model.infer(image_tensor, apply_mask=apply_mask, resolution_level=resolution_level, use_fp16=use_fp16)
+         output = {k: v.cpu().numpy() for k, v in output.items()}
+         return output
+
+     # Full inference pipeline
+     def run(image: np.ndarray, max_size: int = 800, resolution_level: str = 'High', apply_mask: bool = True, remove_edge: bool = True, request: gr.Request = None):
+         larger_size = max(image.shape[:2])
+         if larger_size > max_size:
+             scale = max_size / larger_size
+             image = cv2.resize(image, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
+
+         height, width = image.shape[:2]
+
+         resolution_level_int = {'Low': 0, 'Medium': 5, 'High': 9, 'Ultra': 18}.get(resolution_level, 9)
+         output = run_with_gpu(image, resolution_level_int, apply_mask)
+
+         points, depth, mask, normal = output['points'], output['depth'], output['mask'], output.get('normal', None)
+
+         if remove_edge:
+             mask_cleaned = mask & ~utils3d.numpy.depth_edge(depth, rtol=0.04)
+         else:
+             mask_cleaned = mask
+
+         results = {
+             **output,
+             'mask_cleaned': mask_cleaned,
+             'image': image
+         }
+
+         # depth & normal visualization
+         depth_vis = colorize_depth(depth)
+         if normal is not None:
+             normal_vis = colorize_normal(normal)
+         else:
+             normal_vis = gr.update(label="Normal map (not available for this model)")
+
+         # mesh & pointcloud
+         if normal is None:
+             faces, vertices, vertex_colors, vertex_uvs = utils3d.numpy.image_mesh(
+                 points,
+                 image.astype(np.float32) / 255,
+                 utils3d.numpy.image_uv(width=width, height=height),
+                 mask=mask_cleaned,
+                 tri=True
+             )
+             vertex_normals = None
+         else:
+             faces, vertices, vertex_colors, vertex_uvs, vertex_normals = utils3d.numpy.image_mesh(
+                 points,
+                 image.astype(np.float32) / 255,
+                 utils3d.numpy.image_uv(width=width, height=height),
+                 normal,
+                 mask=mask_cleaned,
+                 tri=True
+             )
+         # Flip the y/z axes of the vertices (and the v coordinate of the UVs) for the exported viewers' convention.
+         vertices = vertices * np.array([1, -1, -1], dtype=np.float32)
+         vertex_uvs = vertex_uvs * np.array([1, -1], dtype=np.float32) + np.array([0, 1], dtype=np.float32)
+         if vertex_normals is not None:
+             vertex_normals = vertex_normals * np.array([1, -1, -1], dtype=np.float32)
+
+         tempdir = Path(tempfile.gettempdir(), 'moge')
+         tempdir.mkdir(exist_ok=True)
+         output_path = Path(tempdir, request.session_hash)
+         shutil.rmtree(output_path, ignore_errors=True)
+         output_path.mkdir(exist_ok=True, parents=True)
+         trimesh.Trimesh(
+             vertices=vertices,
+             faces=faces,
+             vertex_normals=vertex_normals,
+             visual = trimesh.visual.texture.TextureVisuals(
+                 uv=vertex_uvs,
+                 material=trimesh.visual.material.PBRMaterial(
+                     baseColorTexture=Image.fromarray(image),
+                     metallicFactor=0.5,
+                     roughnessFactor=1.0
+                 )
+             ),
+             process=False
+         ).export(output_path / 'mesh.glb')
+         pointcloud = trimesh.PointCloud(
+             vertices=vertices,
+             colors=vertex_colors,
+         )
+         pointcloud.vertex_normals = vertex_normals
+         pointcloud.export(output_path / 'pointcloud.ply', vertex_normal=True)
+         trimesh.PointCloud(
+             vertices=vertices,
+             colors=vertex_colors,
+         ).export(output_path / 'pointcloud.glb', include_normals=True)
+         cv2.imwrite(str(output_path / 'mask.png'), mask.astype(np.uint8) * 255)
+         cv2.imwrite(str(output_path / 'depth.exr'), depth.astype(np.float32), [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_FLOAT])
+         cv2.imwrite(str(output_path / 'points.exr'), cv2.cvtColor(points.astype(np.float32), cv2.COLOR_RGB2BGR), [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_FLOAT])
+         if normal is not None:
+             cv2.imwrite(str(output_path / 'normal.exr'), cv2.cvtColor(normal.astype(np.float32) * np.array([1, -1, -1], dtype=np.float32), cv2.COLOR_RGB2BGR), [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
+
+         files = ['mesh.glb', 'pointcloud.ply', 'depth.exr', 'points.exr', 'mask.png']
+         if normal is not None:
+             files.append('normal.exr')
+
+         for f in files:
+             delete_later(output_path / f)
+
+         # FOV
+         intrinsics = results['intrinsics']
+         fov_x, fov_y = utils3d.numpy.intrinsics_to_fov(intrinsics)
+         fov_x, fov_y = np.rad2deg([fov_x, fov_y])
+
+         # messages
+         viewer_message = f'**Note:** Inference has been completed. It may take a few seconds to download the 3D model.'
+         if resolution_level != 'Ultra':
+             depth_message = f'**Note:** Want a sharper depth map? Try increasing the `maximum image size` and setting the `inference resolution level` to `Ultra` in the settings.'
+         else:
+             depth_message = ""
+
+         return (
+             results,
+             depth_vis,
+             normal_vis,
+             output_path / 'pointcloud.glb',
+             [(output_path / f).as_posix() for f in files if (output_path / f).exists()],
+             f'- **Horizontal FOV: {fov_x:.1f}°**. \n - **Vertical FOV: {fov_y:.1f}°**',
+             viewer_message,
+             depth_message
+         )
+
+     def reset_measure(results: Dict[str, np.ndarray]):
+         return [results['image'], [], ""]
+
+
+     def measure(results: Dict[str, np.ndarray], measure_points: List[Tuple[int, int]], event: gr.SelectData):
+         point2d = event.index[0], event.index[1]
+         measure_points.append(point2d)
+
+         image = results['image'].copy()
+         for p in measure_points:
+             image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)
+
+         depth_text = ""
+         for i, p in enumerate(measure_points):
+             d = results['depth'][p[1], p[0]]
+             depth_text += f"- **P{i + 1} depth: {d:.2f}m.**\n"
+
+         if len(measure_points) == 2:
+             point1, point2 = measure_points
+             image = cv2.line(image, point1, point2, color=(255, 0, 0), thickness=2)
+             distance = np.linalg.norm(results['points'][point1[1], point1[0]] - results['points'][point2[1], point2[0]])
+             measure_points = []
+
+             distance_text = f"- **Distance: {distance:.2f}m**"
+
+             text = depth_text + distance_text
+             return [image, measure_points, text]
+         else:
+             return [image, measure_points, depth_text]
+
+     print("Create Gradio app...")
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown(
+             f'''
+             <div align="center">
+             <h1> Turn a 2D image into 3D with MoGe <a title="Github" href="https://github.com/microsoft/MoGe" target="_blank" rel="noopener noreferrer" style="display: inline-block;"> <img src="https://img.shields.io/github/stars/microsoft/MoGe?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars"> </a> </h1>
+             </div>
+             ''')
+         results = gr.State(value=None)
+         measure_points = gr.State(value=[])
+
+         with gr.Row():
+             with gr.Column():
+                 input_image = gr.Image(type="numpy", image_mode="RGB", label="Input Image")
+                 with gr.Accordion(label="Settings", open=False):
+                     max_size_input = gr.Number(value=800, label="Maximum Image Size", precision=0, minimum=256, maximum=2048)
+                     resolution_level = gr.Dropdown(['Low', 'Medium', 'High', 'Ultra'], label="Inference Resolution Level", value='High')
+                     apply_mask = gr.Checkbox(value=True, label="Apply mask")
+                     remove_edges = gr.Checkbox(value=True, label="Remove edges")
+                 submit_btn = gr.Button("Submit", variant='primary')
+
+             with gr.Column():
+                 with gr.Tabs():
+                     with gr.Tab("3D View"):
+                         viewer_message = gr.Markdown("")
+                         model_3d = gr.Model3D(display_mode="solid", label="3D Point Map", clear_color=[1.0, 1.0, 1.0, 1.0], height="60vh")
+                         fov = gr.Markdown()
+                     with gr.Tab("Depth"):
+                         depth_message = gr.Markdown("")
+                         depth_map = gr.Image(type="numpy", label="Colorized Depth Map", format='png', interactive=False)
+                     with gr.Tab("Normal", interactive=hasattr(model, 'normal_head')):
+                         normal_map = gr.Image(type="numpy", label="Normal Map", format='png', interactive=False)
+                     with gr.Tab("Measure", interactive=hasattr(model, 'scale_head')):
+                         gr.Markdown("### Click on the image to measure the distance between two points. \n"
+                                     "**Note:** Metric scale is most reliable for typical indoor or street scenes, and may degrade for contents unfamiliar to the model (e.g., stylized or close-up images).")
+                         measure_image = gr.Image(type="numpy", show_label=False, format='webp', interactive=False, sources=[])
+                         measure_text = gr.Markdown("")
+                     with gr.Tab("Download"):
+                         files = gr.File(type='filepath', label="Output Files")
+
+         if Path('example_images').exists():
+             example_image_paths = sorted(list(itertools.chain(*[Path('example_images').glob(f'*.{ext}') for ext in ['jpg', 'png', 'jpeg', 'JPG', 'PNG', 'JPEG']])))
+             examples = gr.Examples(
+                 examples = example_image_paths,
+                 inputs=input_image,
+                 label="Examples"
+             )
+
+         submit_btn.click(
+             fn=lambda: [None, None, None, None, None, "", "", ""],
+             outputs=[results, depth_map, normal_map, model_3d, files, fov, viewer_message, depth_message]
+         ).then(
+             fn=run,
+             inputs=[input_image, max_size_input, resolution_level, apply_mask, remove_edges],
+             outputs=[results, depth_map, normal_map, model_3d, files, fov, viewer_message, depth_message]
+         ).then(
+             fn=reset_measure,
+             inputs=[results],
+             outputs=[measure_image, measure_points, measure_text]
+         )
+
+         measure_image.select(
+             fn=measure,
+             inputs=[results, measure_points],
+             outputs=[measure_image, measure_points, measure_text]
+         )
+
+     demo.launch(share=share)
+
+
+ if __name__ == '__main__':
+     main()
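`app.py` wraps the model call in a Gradio UI, but the core path it uses (`import_model_class_by_version(...).from_pretrained(...)` followed by `model.infer(...)`) can also be run headlessly. Below is a minimal sketch under the same assumptions the app makes: a CUDA GPU and the default v2 checkpoint `Ruicheng/moge-2-vitl-normal`; the example image path is one of the files added in this commit.

```python
import cv2
import torch

from moge.model import import_model_class_by_version

# Same default checkpoint and fp16 setup as app.py.
model = import_model_class_by_version('v2').from_pretrained('Ruicheng/moge-2-vitl-normal').cuda().eval().half()

# Load an RGB image and convert it to a 3xHxW float16 tensor in [0, 1].
image = cv2.cvtColor(cv2.imread('example_images/01_HouseIndoor.jpg'), cv2.COLOR_BGR2RGB)
image_tensor = torch.tensor(image, dtype=torch.float16, device='cuda').permute(2, 0, 1) / 255

# resolution_level=9 corresponds to the app's 'High' preset.
with torch.inference_mode():
    output = model.infer(image_tensor, apply_mask=True, resolution_level=9, use_fp16=True)

# app.py consumes these keys: 'points', 'depth', 'mask', 'intrinsics' and,
# for normal-capable checkpoints, 'normal'.
depth = output['depth'].float().cpu().numpy()
```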
assets/overview_simplified.png ADDED

Git LFS Details

  • SHA256: 7025a671e863bddbc22e79dc3e2eca8b7aeaf35fe93f6ef7f2b18f4fc9e093e6
  • Pointer size: 131 Bytes
  • Size of remote file: 414 kB
assets/panorama_pipeline.png ADDED

Git LFS Details

  • SHA256: ed28c5309162bddda016ca600307ecc73f7e6415f9eaaefb9f6fffadf6951aaa
  • Pointer size: 131 Bytes
  • Size of remote file: 738 kB
baselines/da_v2.py ADDED
@@ -0,0 +1,88 @@
+ # Reference: https://github.com/DepthAnything/Depth-Anything-V2
+ import os
+ import sys
+ from typing import *
+ from pathlib import Path
+
+ import click
+ import torch
+ import torch.nn.functional as F
+ import torchvision.transforms as T
+ import torchvision.transforms.functional as TF
+
+ from moge.test.baseline import MGEBaselineInterface
+
+
+ class Baseline(MGEBaselineInterface):
+     def __init__(self, repo_path: str, backbone: str, num_tokens: int, device: Union[torch.device, str]):
+         # Create from repo
+         repo_path = os.path.abspath(repo_path)
+         if repo_path not in sys.path:
+             sys.path.append(repo_path)
+         if not Path(repo_path).exists():
+             raise FileNotFoundError(f'Cannot find the Depth-Anything repository at {repo_path}. Please clone the repository and provide the path to it using the --repo option.')
+         from depth_anything_v2.dpt import DepthAnythingV2
+
+         device = torch.device(device)
+
+         # Instantiate model
+         model = DepthAnythingV2(encoder=backbone, features=256, out_channels=[256, 512, 1024, 1024])
+
+         # Load checkpoint
+         checkpoint_path = os.path.join(repo_path, f'checkpoints/depth_anything_v2_{backbone}.pth')
+         if not os.path.exists(checkpoint_path):
+             raise FileNotFoundError(f'Cannot find the checkpoint file at {checkpoint_path}. Please download the checkpoint file and place it in the checkpoints directory.')
+         checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)
+         model.load_state_dict(checkpoint)
+
+         model.to(device).eval()
+         self.model = model
+         self.num_tokens = num_tokens
+         self.device = device
+
+     @click.command()
+     @click.option('--repo', 'repo_path', type=click.Path(), default='../Depth-Anything-V2', help='Path to the Depth-Anything repository.')
+     @click.option('--backbone', type=click.Choice(['vits', 'vitb', 'vitl']), default='vitl', help='Encoder architecture.')
+     @click.option('--num_tokens', type=int, default=None, help='Number of tokens to use for the input image.')
+     @click.option('--device', type=str, default='cuda', help='Device to use for inference.')
+     @staticmethod
+     def load(repo_path: str, backbone, num_tokens: int, device: torch.device = 'cuda'):
+         return Baseline(repo_path, backbone, num_tokens, device)
+
+     @torch.inference_mode()
+     def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+         original_height, original_width = image.shape[-2:]
+
+         assert intrinsics is None, "Depth-Anything-V2 does not support camera intrinsics input"
+
+         if image.ndim == 3:
+             image = image.unsqueeze(0)
+             omit_batch_dim = True
+         else:
+             omit_batch_dim = False
+
+         if self.num_tokens is None:
+             resize_factor = 518 / min(original_height, original_width)
+             expected_width = round(original_width * resize_factor / 14) * 14
+             expected_height = round(original_height * resize_factor / 14) * 14
+         else:
+             aspect_ratio = original_width / original_height
+             tokens_rows = round((self.num_tokens * aspect_ratio) ** 0.5)
+             tokens_cols = round((self.num_tokens / aspect_ratio) ** 0.5)
+             expected_width = tokens_cols * 14
+             expected_height = tokens_rows * 14
+         image = TF.resize(image, (expected_height, expected_width), interpolation=T.InterpolationMode.BICUBIC, antialias=True)
+
+         image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+
+         disparity = self.model(image)
+
+         disparity = F.interpolate(disparity[:, None], size=(original_height, original_width), mode='bilinear', align_corners=False, antialias=False)[:, 0]
+
+         if omit_batch_dim:
+             disparity = disparity.squeeze(0)
+
+         return {
+             'disparity_affine_invariant': disparity
+         }
+
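For context, a hedged usage sketch of the wrapper above (not part of the commit): it assumes Depth-Anything-V2 is cloned at `../Depth-Anything-V2` with `checkpoints/depth_anything_v2_vitl.pth` downloaded, as the error messages in `__init__` describe, and that `baselines` is importable from the repository root (the import path is an assumption).

```python
import torch

from baselines.da_v2 import Baseline  # import path is an assumption

# Construct the wrapper directly; the CLI options shown above map to these arguments.
baseline = Baseline(repo_path='../Depth-Anything-V2', backbone='vitl', num_tokens=None, device='cuda')

# A 3xHxW float image in [0, 1]; with num_tokens=None the short side is resized to ~518
# and both sides are snapped to multiples of the ViT patch size (14).
image = torch.rand(3, 480, 640, device='cuda')
prediction = baseline.infer(image)
disparity = prediction['disparity_affine_invariant']  # HxW affine-invariant disparity
```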
baselines/da_v2_metric.py ADDED
@@ -0,0 +1,99 @@
+ # Reference https://github.com/DepthAnything/Depth-Anything-V2/metric_depth
+ import os
+ import sys
+ from typing import *
+ from pathlib import Path
+
+ import click
+ import torch
+ import torch.nn.functional as F
+ import torchvision.transforms as T
+ import torchvision.transforms.functional as TF
+ import cv2
+
+ from moge.test.baseline import MGEBaselineInterface
+
+
+ class Baseline(MGEBaselineInterface):
+
+     def __init__(self, repo_path: str, backbone: str, domain: str, num_tokens: int, device: str):
+         device = torch.device(device)
+         repo_path = os.path.abspath(repo_path)
+         if not Path(repo_path).exists():
+             raise FileNotFoundError(f'Cannot find the Depth-Anything repository at {repo_path}. Please clone the repository and provide the path to it using the --repo option.')
+         sys.path.append(os.path.join(repo_path, 'metric_depth'))
+         from depth_anything_v2.dpt import DepthAnythingV2
+
+         model_configs = {
+             'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+             'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+             'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
+         }
+
+         if domain == 'indoor':
+             dataset = 'hypersim'
+             max_depth = 20
+         elif domain == 'outdoor':
+             dataset = 'vkitti'
+             max_depth = 80
+         else:
+             raise ValueError(f"Invalid domain: {domain}")
+
+         model = DepthAnythingV2(**model_configs[backbone], max_depth=max_depth)
+         checkpoint_path = os.path.join(repo_path, f'checkpoints/depth_anything_v2_metric_{dataset}_{backbone}.pth')
+         if not os.path.exists(checkpoint_path):
+             raise FileNotFoundError(f'Cannot find the checkpoint file at {checkpoint_path}. Please download the checkpoint file and place it in the checkpoints directory.')
+         model.load_state_dict(torch.load(checkpoint_path, map_location='cpu', weights_only=True))
+         model.eval().to(device)
+
+         self.model = model
+         self.num_tokens = num_tokens
+         self.device = device
+
+     @click.command()
+     @click.option('--repo', 'repo_path', type=click.Path(), default='../Depth-Anything-V2', help='Path to the Depth-Anything repository.')
+     @click.option('--backbone', type=click.Choice(['vits', 'vitb', 'vitl']), default='vitl', help='Backbone architecture.')
+     @click.option('--domain', type=click.Choice(['indoor', 'outdoor']), help='Domain of the dataset.')
+     @click.option('--num_tokens', type=int, default=None, help='Number of tokens for the ViT model')
+     @click.option('--device', type=str, default='cuda', help='Device to use for inference.')
+     @staticmethod
+     def load(repo_path: str, backbone: str, domain: str, num_tokens: int, device: str):
+         return Baseline(repo_path, backbone, domain, num_tokens, device)
+
+     @torch.inference_mode()
+     def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+         original_height, original_width = image.shape[-2:]
+
+         assert intrinsics is None, "Depth-Anything-V2 does not support camera intrinsics input"
+
+         if image.ndim == 3:
+             image = image.unsqueeze(0)
+             omit_batch_dim = True
+         else:
+             omit_batch_dim = False
+
+         if self.num_tokens is None:
+             resize_factor = 518 / min(original_height, original_width)
+             expected_width = round(original_width * resize_factor / 14) * 14
+             expected_height = round(original_height * resize_factor / 14) * 14
+         else:
+             aspect_ratio = original_width / original_height
+             tokens_rows = round((self.num_tokens * aspect_ratio) ** 0.5)
+             tokens_cols = round((self.num_tokens / aspect_ratio) ** 0.5)
+             expected_width = tokens_cols * 14
+             expected_height = tokens_rows * 14
+         image = TF.resize(image, (expected_height, expected_width), interpolation=T.InterpolationMode.BICUBIC, antialias=True)
+
+         image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+
+         depth = self.model(image)
+
+         depth = F.interpolate(depth[:, None], size=(original_height, original_width), mode='bilinear', align_corners=False, antialias=False)[:, 0]
+
+         if omit_batch_dim:
+             depth = depth.squeeze(0)
+
+         return {
+             'depth_metric': depth
+         }
+
baselines/metric3d_v2.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reference: https://github.com/YvanYin/Metric3D
2
+ import os
3
+ import sys
4
+ from typing import *
5
+
6
+ import click
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import cv2
10
+
11
+ from moge.test.baseline import MGEBaselineInterface
12
+
13
+
14
+ class Baseline(MGEBaselineInterface):
15
+ def __init__(self, backbone: Literal['vits', 'vitl', 'vitg'], device):
16
+ backbone_map = {
17
+ 'vits': 'metric3d_vit_small',
18
+ 'vitl': 'metric3d_vit_large',
19
+ 'vitg': 'metric3d_vit_giant2'
20
+ }
21
+
22
+ device = torch.device(device)
23
+ model = torch.hub.load('yvanyin/metric3d', backbone_map[backbone], pretrain=True)
24
+ model.to(device).eval()
25
+
26
+ self.model = model
27
+ self.device = device
28
+
29
+ @click.command()
30
+ @click.option('--backbone', type=click.Choice(['vits', 'vitl', 'vitg']), default='vitl', help='Encoder architecture.')
31
+ @click.option('--device', type=str, default='cuda', help='Device to use.')
32
+ @staticmethod
33
+ def load(backbone: str = 'vitl', device: torch.device = 'cuda'):
34
+ return Baseline(backbone, device)
35
+
36
+ @torch.inference_mode()
37
+ def inference_one_image(self, image: torch.Tensor, intrinsics: torch.Tensor = None):
38
+ # Reference: https://github.com/YvanYin/Metric3D/blob/main/mono/utils/do_test.py
39
+
40
+ # rgb_origin: RGB image in HWC layout, value range 0-255 (float)
41
+ rgb_origin = image.cpu().numpy().transpose((1, 2, 0)) * 255
42
+
43
+ # keep ratio resize
44
+ input_size = (616, 1064) # for vit model
45
+ h, w = rgb_origin.shape[:2]
46
+ scale = min(input_size[0] / h, input_size[1] / w)
47
+ rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
48
+ if intrinsics is not None:
49
+ focal = intrinsics[0, 0] * int(w * scale)
50
+
51
+ # padding to input_size
52
+ padding = [123.675, 116.28, 103.53]
53
+ h, w = rgb.shape[:2]
54
+ pad_h = input_size[0] - h
55
+ pad_w = input_size[1] - w
56
+ pad_h_half = pad_h // 2
57
+ pad_w_half = pad_w // 2
58
+ rgb = cv2.copyMakeBorder(rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding)
59
+ pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
60
+
61
+ # normalize rgb
62
+ mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
63
+ std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
64
+ rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
65
+ rgb = torch.div((rgb - mean), std)
66
+ rgb = rgb[None, :, :, :].cuda()
67
+
68
+ # inference
69
+ pred_depth, confidence, output_dict = self.model.inference({'input': rgb})
70
+
71
+ # un pad
72
+ pred_depth = pred_depth.squeeze()
73
+ pred_depth = pred_depth[pad_info[0] : pred_depth.shape[0] - pad_info[1], pad_info[2] : pred_depth.shape[1] - pad_info[3]]
74
+ pred_depth = pred_depth.clamp_min(0.5) # clamp to 0.5 m, since Metric3D can yield very small depth values that would otherwise break the scale-shift alignment
75
+
76
+ # upsample to original size
77
+ pred_depth = F.interpolate(pred_depth[None, None, :, :], image.shape[-2:], mode='bilinear').squeeze()
78
+
79
+ if intrinsics is not None:
80
+ # de-canonical transform
81
+ canonical_to_real_scale = focal / 1000.0 # 1000.0 is the focal length of canonical camera
82
+ pred_depth = pred_depth * canonical_to_real_scale # now the depth is metric
83
+ pred_depth = torch.clamp(pred_depth, 0, 300)
84
+
85
+ pred_normal, normal_confidence = output_dict['prediction_normal'].split([3, 1], dim=1) # see https://arxiv.org/abs/2109.09881 for details
86
+
87
+ # unpad the normal map
88
+ pred_normal = pred_normal.squeeze(0)
89
+ pred_normal = pred_normal[:, pad_info[0] : pred_normal.shape[1] - pad_info[1], pad_info[2] : pred_normal.shape[2] - pad_info[3]]
90
+
91
+ # resize the normal map to the original resolution and re-normalize
92
+ pred_normal = F.interpolate(pred_normal[None, :, :, :], image.shape[-2:], mode='bilinear').squeeze(0)
93
+ pred_normal = F.normalize(pred_normal, p=2, dim=0)
94
+
95
+ return pred_depth, pred_normal.permute(1, 2, 0)
96
+
97
+ @torch.inference_mode()
98
+ def infer(self, image: torch.Tensor, intrinsics: torch.Tensor = None):
99
+ # image: (B, 3, H, W) or (3, H, W), RGB in [0, 1]
100
+ if image.ndim == 3:
101
+ pred_depth, pred_normal = self.inference_one_image(image, intrinsics)
102
+ else:
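+ # Batched input: run per-image inference and stack the results.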
103
+ pred_depth, pred_normal = [], []
+ for i in range(image.shape[0]):
104
+ pred_depth_i, pred_normal_i = self.inference_one_image(image[i], intrinsics[i] if intrinsics is not None else None)
105
+ pred_depth.append(pred_depth_i)
106
+ pred_normal.append(pred_normal_i)
107
+ pred_depth = torch.stack(pred_depth, dim=0)
108
+ pred_normal = torch.stack(pred_normal, dim=0)
109
+
110
+ if intrinsics is not None:
111
+ return {
112
+ "depth_metric": pred_depth,
113
+ }
114
+ else:
115
+ return {
116
+ "depth_scale_invariant": pred_depth,
117
+ }
baselines/moge.py ADDED
@@ -0,0 +1,83 @@
1
+ import os
2
+ import sys
3
+ from typing import *
4
+ import importlib
5
+
6
+ import click
7
+ import torch
8
+ import utils3d
9
+
10
+ from moge.test.baseline import MGEBaselineInterface
11
+
12
+
13
+ class Baseline(MGEBaselineInterface):
14
+
15
+ def __init__(self, num_tokens: int, resolution_level: int, pretrained_model_name_or_path: str, use_fp16: bool, device: str = 'cuda:0', version: str = 'v1'):
16
+ super().__init__()
17
+ from moge.model import import_model_class_by_version
18
+ MoGeModel = import_model_class_by_version(version)
19
+ self.version = version
20
+
21
+ self.model = MoGeModel.from_pretrained(pretrained_model_name_or_path).to(device).eval()
22
+
23
+ self.device = torch.device(device)
24
+ self.num_tokens = num_tokens
25
+ self.resolution_level = resolution_level
26
+ self.use_fp16 = use_fp16
27
+
28
+ @click.command()
29
+ @click.option('--num_tokens', type=int, default=None)
30
+ @click.option('--resolution_level', type=int, default=9)
31
+ @click.option('--pretrained', 'pretrained_model_name_or_path', type=str, default='Ruicheng/moge-vitl')
32
+ @click.option('--fp16', 'use_fp16', is_flag=True)
33
+ @click.option('--device', type=str, default='cuda:0')
34
+ @click.option('--version', type=str, default='v1')
35
+ @staticmethod
36
+ def load(num_tokens: int, resolution_level: int, pretrained_model_name_or_path: str, use_fp16: bool, device: str = 'cuda:0', version: str = 'v1'):
37
+ return Baseline(num_tokens, resolution_level, pretrained_model_name_or_path, use_fp16, device, version)
38
+
39
+ # Implementation for inference
40
+ @torch.inference_mode()
41
+ def infer(self, image: torch.FloatTensor, intrinsics: Optional[torch.FloatTensor] = None):
42
+ if intrinsics is not None:
43
+ fov_x, _ = utils3d.torch.intrinsics_to_fov(intrinsics)
44
+ fov_x = torch.rad2deg(fov_x)
45
+ else:
46
+ fov_x = None
47
+ output = self.model.infer(image, fov_x=fov_x, apply_mask=True, num_tokens=self.num_tokens)
48
+
49
+ if self.version == 'v1':
50
+ return {
51
+ 'points_scale_invariant': output['points'],
52
+ 'depth_scale_invariant': output['depth'],
53
+ 'intrinsics': output['intrinsics'],
54
+ }
55
+ else:
56
+ return {
57
+ 'points_metric': output['points'],
58
+ 'depth_metric': output['depth'],
59
+ 'intrinsics': output['intrinsics'],
60
+ }
61
+
62
+ @torch.inference_mode()
63
+ def infer_for_evaluation(self, image: torch.FloatTensor, intrinsics: torch.FloatTensor = None):
64
+ if intrinsics is not None:
65
+ fov_x, _ = utils3d.torch.intrinsics_to_fov(intrinsics)
66
+ fov_x = torch.rad2deg(fov_x)
67
+ else:
68
+ fov_x = None
69
+ output = self.model.infer(image, fov_x=fov_x, apply_mask=False, num_tokens=self.num_tokens, use_fp16=self.use_fp16)
70
+
71
+ if self.version == 'v1':
72
+ return {
73
+ 'points_scale_invariant': output['points'],
74
+ 'depth_scale_invariant': output['depth'],
75
+ 'intrinsics': output['intrinsics'],
76
+ }
77
+ else:
78
+ return {
79
+ 'points_metric': output['points'],
80
+ 'depth_metric': output['depth'],
81
+ 'intrinsics': output['intrinsics'],
82
+ }
83
+
configs/eval/all_benchmarks.json ADDED
@@ -0,0 +1,78 @@
1
+ {
2
+ "NYUv2": {
3
+ "path": "data/eval/NYUv2",
4
+ "width": 640,
5
+ "height": 480,
6
+ "split": ".index.txt",
7
+ "depth_unit": 1.0
8
+ },
9
+ "KITTI": {
10
+ "path": "data/eval/KITTI",
11
+ "width": 750,
12
+ "height": 375,
13
+ "split": ".index.txt",
14
+ "depth_unit": 1
15
+ },
16
+ "ETH3D": {
17
+ "path": "data/eval/ETH3D",
18
+ "width": 2048,
19
+ "height": 1365,
20
+ "split": ".index.txt",
21
+ "include_segmentation": true,
22
+ "depth_unit": 1
23
+ },
24
+ "iBims-1": {
25
+ "path": "data/eval/iBims-1",
26
+ "width": 640,
27
+ "height": 480,
28
+ "split": ".index.txt",
29
+ "has_sharp_boundary": true,
30
+ "include_segmentation": true,
31
+ "depth_unit": 1.0
32
+ },
33
+ "GSO": {
34
+ "path": "data/eval/GSO",
35
+ "width": 512,
36
+ "height": 512,
37
+ "split": ".index.txt"
38
+ },
39
+ "Sintel": {
40
+ "path": "data/eval/Sintel",
41
+ "width": 872,
42
+ "height": 436,
43
+ "split": ".index.txt",
44
+ "has_sharp_boundary": true,
45
+ "include_segmentation": true
46
+ },
47
+ "DDAD": {
48
+ "path": "data/eval/DDAD",
49
+ "width": 1400,
50
+ "height": 700,
51
+ "include_segmentation": true,
52
+ "split": ".index.txt",
53
+ "depth_unit": 1.0
54
+ },
55
+ "DIODE": {
56
+ "path": "data/eval/DIODE",
57
+ "width": 1024,
58
+ "height": 768,
59
+ "split": ".index.txt",
60
+ "include_segmentation": true,
61
+ "depth_unit": 1.0
62
+ },
63
+ "Spring": {
64
+ "path": "data/eval/Spring",
65
+ "width": 1920,
66
+ "height": 1080,
67
+ "split": ".index.txt",
68
+ "has_sharp_boundary": true
69
+ },
70
+ "HAMMER": {
71
+ "path": "data/eval/HAMMER",
72
+ "width": 1664,
73
+ "height": 832,
74
+ "split": ".index.txt",
75
+ "depth_unit": 1,
76
+ "has_sharp_boundary": true
77
+ }
78
+ }
configs/eval/benchmarks/ddad.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "DDAD": {
3
+ "path": "data/eval/DDAD",
4
+ "width": 1400,
5
+ "height": 700,
6
+ "include_segmentation": true,
7
+ "split": ".index.txt"
8
+ }
9
+ }
configs/eval/benchmarks/diode.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "DIODE": {
3
+ "path": "data/eval/DIODE",
4
+ "width": 1024,
5
+ "height": 768,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true
8
+ }
9
+ }
configs/eval/benchmarks/eth3d.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "ETH3D": {
3
+ "path": "data/eval/ETH3D",
4
+ "width": 2048,
5
+ "height": 1365,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true,
8
+ "depth_unit": 1
9
+ }
10
+ }
configs/eval/benchmarks/gso.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "GSO": {
3
+ "path": "data/eval/GSO",
4
+ "width": 512,
5
+ "height": 512,
6
+ "split": ".index.txt"
7
+ }
8
+ }
configs/eval/benchmarks/hammer.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "HAMMER": {
3
+ "path": "data/eval/HAMMER",
4
+ "width": 1664,
5
+ "height": 832,
6
+ "split": ".index.txt",
7
+ "depth_unit": 1,
8
+ "has_sharp_boundary": true
9
+ }
10
+ }
configs/eval/benchmarks/ibims-1.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "iBims-1": {
3
+ "path": "data/eval/iBims-1",
4
+ "width": 640,
5
+ "height": 480,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true,
8
+ "has_sharp_boundary": true
9
+ }
10
+ }
configs/eval/benchmarks/kitti.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "KITTI": {
3
+ "path": "data/eval/KITTI",
4
+ "width": 750,
5
+ "height": 375,
6
+ "split": ".index.txt",
7
+ "depth_unit": 1
8
+ }
9
+ }
configs/eval/benchmarks/nyu.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "NYUv2": {
3
+ "path": "data/eval/NYUv2",
4
+ "width": 640,
5
+ "height": 480,
6
+ "split": ".test.txt"
7
+ }
8
+ }
configs/eval/benchmarks/sintel.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "Sintel": {
3
+ "path": "data/eval/Sintel",
4
+ "width": 872,
5
+ "height": 436,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true,
8
+ "has_sharp_boundary": true
9
+ }
10
+ }
configs/eval/benchmarks/spring.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "Spring": {
3
+ "path": "data/eval/Spring",
4
+ "width": 1920,
5
+ "height": 1080,
6
+ "split": ".test.txt",
7
+ "has_sharp_boundary": true
8
+ }
9
+ }
configs/train/v1.json ADDED
@@ -0,0 +1,77 @@
1
+ {
2
+ "data": {
3
+ "aspect_ratio_range": [0.5, 2.0],
4
+ "area_range": [250000, 1000000],
5
+ "clamp_max_depth": 1000.0,
6
+ "center_augmentation": 0.5,
7
+ "fov_range_absolute": [1, 179],
8
+ "fov_range_relative": [0.01, 1.0],
9
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring"],
10
+ "datasets": [
11
+ {
12
+ "name": "TartanAir",
13
+ "path": "blobmnt/data_v3/TartanAir",
14
+ "label_type": "synthetic",
15
+ "index": ".index.txt",
16
+ "depth": "depth.png",
17
+ "weight": 4.8,
18
+ "center_augmentation": 0.25,
19
+ "fov_range_absolute": [30, 150],
20
+ "fov_range_relative": [0.5, 1.0],
21
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring", "shot_noise"]
22
+ }
23
+ ]
24
+ },
25
+ "model_version": "v1",
26
+ "model": {
27
+ "encoder": "dinov2_vitl14",
28
+ "remap_output": "exp",
29
+ "intermediate_layers": 4,
30
+ "dim_upsample": [256, 128, 64],
31
+ "dim_times_res_block_hidden": 2,
32
+ "num_res_blocks": 2,
33
+ "num_tokens_range": [1200, 2500],
34
+ "last_conv_channels": 32,
35
+ "last_conv_size": 1
36
+ },
37
+ "optimizer": {
38
+ "type": "AdamW",
39
+ "params": [
40
+ {"params": {"include": ["*"], "exclude": ["*backbone.*"]}, "lr": 1e-4},
41
+ {"params": {"include": ["*backbone.*"]}, "lr": 1e-5}
42
+ ]
43
+ },
44
+ "lr_scheduler": {
45
+ "type": "SequentialLR",
46
+ "params": {
47
+ "schedulers": [
48
+ {"type": "LambdaLR", "params": {"lr_lambda": ["1.0", "max(0.0, min(1.0, (epoch - 1000) / 1000))"]}},
49
+ {"type": "StepLR", "params": {"step_size": 25000, "gamma": 0.5}}
50
+ ],
51
+ "milestones": [2000]
52
+ }
53
+ },
54
+ "low_resolution_training_steps": 50000,
55
+ "loss": {
56
+ "invalid": {},
57
+ "synthetic": {
58
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
59
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
60
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
61
+ "patch_64": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 64, "align_resolution": 4, "num_patches": 4096}},
62
+ "normal": {"function": "normal_loss", "weight": 1.0},
63
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
64
+ },
65
+ "sfm": {
66
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
67
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
68
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
69
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
70
+ },
71
+ "lidar": {
72
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
73
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
74
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
75
+ }
76
+ }
77
+ }
docs/eval.md ADDED
@@ -0,0 +1,77 @@
1
+ # Evaluation
2
+
3
+ We provide a unified evaluation script that runs baselines on multiple benchmarks. It takes a baseline model and the evaluation configurations, evaluates on the fly, and writes the results to a JSON file.
4
+
5
+ ## Benchmarks
6
+
7
+ Download the processed datasets from [Hugging Face Datasets](https://huggingface.co/datasets/Ruicheng/monocular-geometry-evaluation) and put them in the `data/eval` directory using `huggingface-cli`:
8
+
9
+ ```bash
10
+ mkdir -p data/eval
11
+ huggingface-cli download Ruicheng/monocular-geometry-evaluation --repo-type dataset --local-dir data/eval --local-dir-use-symlinks False
12
+ ```
13
+
14
+ Then unzip the downloaded files:
15
+
16
+ ```bash
17
+ cd data/eval
18
+ unzip '*.zip'
19
+ # rm *.zip # optional: remove the zip files if you no longer need them
20
+ ```
21
+
22
+ ## Configuration
23
+
24
+ See [`configs/eval/all_benchmarks.json`](../configs/eval/all_benchmarks.json) for an example of evaluation configurations on all benchmarks. You can modify this file to evaluate on different benchmarks or different baselines.
25
+
26
+ ## Baseline
27
+
28
+ Example baselines are provided in [`baselines/`](../baselines/). Pass the path to the baseline's Python file to the `--baseline` argument of the evaluation script.
29
+
30
+ ## Run Evaluation
31
+
32
+ Run the script [`moge/scripts/eval_baseline.py`](../moge/scripts/eval_baseline.py).
33
+ For example,
34
+
35
+ ```bash
36
+ # Evaluate MoGe on the 10 benchmarks
37
+ python moge/scripts/eval_baseline.py --baseline baselines/moge.py --config configs/eval/all_benchmarks.json --output eval_output/moge.json --pretrained Ruicheng/moge-vitl --resolution_level 9
38
+
39
+ # Evaluate Depth Anything V2 on the 10 benchmarks. (NOTE: affine disparity)
40
+ python moge/scripts/eval_baseline.py --baseline baselines/da_v2.py --config configs/eval/all_benchmarks.json --output eval_output/da_v2.json
41
+ ```
42
+
43
+ The `--baseline`, `--config` and `--output` arguments are consumed by the evaluation script itself. The remaining arguments, e.g. `--pretrained` and `--resolution_level`, are custom options forwarded to the baseline's `load()` function.
44
+
45
+ Details of the arguments:
46
+
47
+ ```
48
+ Usage: eval_baseline.py [OPTIONS]
49
+
50
+ Evaluation script.
51
+
52
+ Options:
53
+ --baseline PATH Path to the baseline model python code.
54
+ --config PATH Path to the evaluation configurations. Defaults to
55
+ "configs/eval/all_benchmarks.json".
56
+ --output PATH Path to the output json file.
57
+ --oracle Use oracle mode for evaluation, i.e., use the GT intrinsics
58
+ input.
59
+ --dump_pred Dump prediction results.
60
+ --dump_gt Dump ground truth.
61
+ --help Show this message and exit.
62
+ ```
63
+
64
+
65
+
66
+ ## Wrap a Customized Baseline
67
+
68
+ Wrap any baseline method with [`moge.test.baseline.MGEBaselineInterface`](../moge/test/baseline.py).
69
+ See [`baselines/`](../baselines/) for more examples.
70
+
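+ For reference, here is a minimal sketch of such a wrapper, following the pattern of the baselines in this repository (the file name, click options, and the dummy prediction are illustrative; replace them with your own model and return the output keys your method supports):
+ 
+ ```python
+ # my_baseline.py -- hypothetical example, not part of the repository
+ from typing import *
+ 
+ import click
+ import torch
+ 
+ from moge.test.baseline import MGEBaselineInterface
+ 
+ 
+ class Baseline(MGEBaselineInterface):
+     def __init__(self, device: str = 'cuda'):
+         self.device = torch.device(device)
+         # Load your model here and move it to self.device.
+ 
+     @click.command()
+     @click.option('--device', type=str, default='cuda', help='Device to use for inference.')
+     @staticmethod
+     def load(device: str = 'cuda'):
+         # Extra command-line arguments of the eval/infer scripts are forwarded to this click command.
+         return Baseline(device)
+ 
+     @torch.inference_mode()
+     def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+         # image: (3, H, W) or (B, 3, H, W), RGB in [0, 1]; intrinsics: optional normalized 3x3 camera matrix.
+         height, width = image.shape[-2:]
+         depth = torch.ones((*image.shape[:-3], height, width), device=image.device)  # dummy prediction
+         # Return the outputs your method provides, e.g. 'depth_scale_invariant', 'depth_metric',
+         # 'points_scale_invariant', 'points_metric', 'intrinsics'.
+         return {'depth_scale_invariant': depth}
+ ```
+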
71
+ It is a good idea to check the correctness of the baseline implementation by running inference on a small set of images via [`moge/scripts/infer_baselines.py`](../moge/scripts/infer_baselines.py):
72
+
73
+ ```bash
74
+ python moge/scripts/infer_baselines.py --baseline baselines/moge.py --input example_images/ --output infer_output/moge --pretrained Ruicheng/moge-vitl --maps --ply
75
+ ```
76
+
77
+
docs/train.md ADDED
@@ -0,0 +1,181 @@
1
+
2
+ # Training
3
+
4
+ This document provides instructions for training and finetuning the MoGe model.
5
+
6
+ ## Additional Requirements
7
+
8
+ The following packages other than those listed in [`pyproject.toml`](../pyproject.toml) are required for training and finetuning the MoGe model:
9
+
10
+ ```
11
+ accelerate
12
+ sympy
13
+ mlflow
14
+ ```
15
+
16
+ ## Data preparation
17
+
18
+ ### Dataset format
19
+
20
+ Each dataset should be organized as follows:
21
+
22
+ ```
23
+ somedataset
24
+ ├── .index.txt # A list of instance paths
25
+ ├── folder1
26
+ │ ├── instance1 # Each instance is in a folder
27
+ │ │ ├── image.jpg # RGB image.
28
+ │ │ ├── depth.png # 16-bit depth. See moge/utils/io.py for details
29
+ │ │ ├── meta.json # Stores "intrinsics" as a 3x3 matrix
30
+ │ │ └── ... # Other components such as segmentation mask, normal map, etc.
31
+ ...
32
+ ```
33
+
34
+ * `.index.txt` is placed at the top directory and stores a list of instance paths in this dataset. The dataloader will look for instances in this list. You may also use a custom split, e.g. `.train.txt` or `.val.txt`, and specify it in the configuration file.
35
+
36
+ * For depth images, it is recommended to use `read_depth()` and `write_depth()` in [`moge/utils/io.py`](../moge/utils/io.py). Depth is stored on a logarithmic scale in 16-bit PNG format, which offers a good balance of precision, dynamic range, and compression ratio compared to 16-bit/32-bit EXR and linear-depth formats. It also encodes `NaN` and `Inf` for invalid depth values.
37
+
38
+ * The `meta.json` should be a dictionary containing the key `intrinsics`, which stores the **normalized** 3x3 camera intrinsics matrix. You may put additional metadata in the same file (see the sketch after this list).
39
+
40
+ * We also support reading and storing segmentation masks for evaluation data (see the evaluation of local points in the paper), which are saved in PNG format with semantic labels stored in the PNG metadata as JSON strings. See `read_segmentation()` and `write_segmentation()` in [`moge/utils/io.py`](../moge/utils/io.py) for details.
41
+
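+ For illustration, here is a rough sketch of how a single instance could be written (the exact I/O signatures live in [`moge/utils/io.py`](../moge/utils/io.py); the `write_depth(path, depth)` call and the `write_instance` helper below are assumptions for illustration only):
+ 
+ ```python
+ # Hypothetical helper for illustration only -- check moge/utils/io.py for the actual I/O signatures.
+ import json
+ from pathlib import Path
+ 
+ import cv2
+ import numpy as np
+ 
+ from moge.utils.io import write_depth  # assumed signature: write_depth(path, depth)
+ 
+ 
+ def write_instance(folder: Path, image: np.ndarray, depth: np.ndarray, intrinsics: np.ndarray):
+     # image: (H, W, 3) uint8 RGB; depth: (H, W) float32 with NaN/Inf for invalid pixels;
+     # intrinsics: normalized 3x3 camera matrix, as described above.
+     folder.mkdir(parents=True, exist_ok=True)
+     cv2.imwrite(str(folder / 'image.jpg'), cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
+     write_depth(folder / 'depth.png', depth)
+     with open(folder / 'meta.json', 'w') as f:
+         json.dump({'intrinsics': intrinsics.tolist()}, f)
+     # Remember to append the instance's relative path to the dataset's .index.txt (or your custom split file).
+ ```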
42
+
43
+ ### Visual inspection
44
+
45
+ We provide a script to visualize the data and check the data quality. It will export the instance as a PLY file for visualization of point cloud.
46
+
47
+ ```bash
48
+ python moge/scripts/vis_data.py PATH_TO_INSTANCE --ply [-o SOMEWHERE_ELSE_TO_SAVE_VIS]
49
+ ```
50
+
51
+ ### DataLoader
52
+
53
+ Our training dataloader is customized to handle data loading, perspective cropping, and augmentation in a multithreaded pipeline. Please refer to [`moge/train/dataloader.py`](../moge/train/dataloader.py) for details.
54
+
55
+
56
+ ## Configuration
57
+
58
+ See [`configs/train/v1.json`](../configs/train/v1.json) for an example configuration file. The configuration file defines the hyperparameters for training the MoGe model.
59
+ Here is a commented configuration for reference:
60
+
61
+ ```json
62
+ {
63
+ "data": {
64
+ "aspect_ratio_range": [0.5, 2.0], # Range of aspect ratio of sampled images
65
+ "area_range": [250000, 1000000], # Range of sampled image area in pixels
66
+ "clamp_max_depth": 1000.0, # Maximum far/near depth ratio used for clamping
67
+ "center_augmentation": 0.5, # Ratio of center crop augmentation
68
+ "fov_range_absolute": [1, 179], # Absolute range of FOV in degrees
69
+ "fov_range_relative": [0.01, 1.0], # Relative range of FOV to the original FOV
70
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring"], # List of image augmentation techniques
71
+ "datasets": [
72
+ {
73
+ "name": "TartanAir", # Name of the dataset. Name it as you like.
74
+ "path": "data/TartanAir", # Path to the dataset
75
+ "label_type": "synthetic", # Label type for this dataset. Losses will be applied accordingly; see the "loss" config.
76
+ "weight": 4.8, # Probability of sampling this dataset
77
+ "index": ".index.txt", # File name of the index file. Defaults to .index.txt
78
+ "depth": "depth.png", # File name of depth images. Defaults to depth.png
79
+ "center_augmentation": 0.25, # Dataset-specific hyperparameters below override the global ones above.
80
+ "fov_range_absolute": [30, 150],
81
+ "fov_range_relative": [0.5, 1.0],
82
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring", "shot_noise"]
83
+ }
84
+ ]
85
+ },
86
+ "model_version": "v1", # Model version. If you have multiple model variants, you can use this to switch between them.
87
+ "model": { # Model hyperparameters. Will be passed to Model __init__() as kwargs.
88
+ "encoder": "dinov2_vitl14",
89
+ "remap_output": "exp",
90
+ "intermediate_layers": 4,
91
+ "dim_upsample": [256, 128, 64],
92
+ "dim_times_res_block_hidden": 2,
93
+ "num_res_blocks": 2,
94
+ "num_tokens_range": [1200, 2500],
95
+ "last_conv_channels": 32,
96
+ "last_conv_size": 1
97
+ },
98
+ "optimizer": { # Reflection-like optimizer configuration. See build_optimizer() in moge/train/utils.py for details; a rough illustrative sketch follows this configuration block.
99
+ "type": "AdamW",
100
+ "params": [
101
+ {"params": {"include": ["*"], "exclude": ["*backbone.*"]}, "lr": 1e-4},
102
+ {"params": {"include": ["*backbone.*"]}, "lr": 1e-5}
103
+ ]
104
+ },
105
+ "lr_scheduler": { # Reflection-like lr_scheduler configuration. See build_lr_scheduler() in moge/train/utils.py for details.
106
+ "type": "SequentialLR",
107
+ "params": {
108
+ "schedulers": [
109
+ {"type": "LambdaLR", "params": {"lr_lambda": ["1.0", "max(0.0, min(1.0, (epoch - 1000) / 1000))"]}},
110
+ {"type": "StepLR", "params": {"step_size": 25000, "gamma": 0.5}}
111
+ ],
112
+ "milestones": [2000]
113
+ }
114
+ },
115
+ "low_resolution_training_steps": 50000, # Total number of low-resolution training steps. This makes early-stage training faster; later-stage training on varying-size images is slower.
116
+ "loss": {
117
+ "invalid": {}, # Invalid instances (e.g. a runtime error when loading data)
118
+ "synthetic": { # Below are loss hyperparameters
119
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
120
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
121
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
122
+ "patch_64": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 64, "align_resolution": 4, "num_patches": 4096}},
123
+ "normal": {"function": "normal_loss", "weight": 1.0},
124
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
125
+ },
126
+ "sfm": {
127
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
128
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
129
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
130
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
131
+ },
132
+ "lidar": {
133
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
134
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
135
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
136
+ }
137
+ }
138
+ }
139
+ ```
140
+
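+ The reflection-like `optimizer` and `lr_scheduler` entries are resolved by `build_optimizer()` and `build_lr_scheduler()` in `moge/train/utils.py`. As a rough illustration of the idea only (not the repository's implementation), the include/exclude patterns can be mapped to optimizer parameter groups like this:
+ 
+ ```python
+ # Illustrative sketch -- the actual logic lives in moge/train/utils.py and may differ.
+ from fnmatch import fnmatch
+ 
+ import torch
+ 
+ 
+ def build_optimizer_sketch(model: torch.nn.Module, config: dict) -> torch.optim.Optimizer:
+     groups = []
+     for group_cfg in config['params']:
+         include = group_cfg['params'].get('include', ['*'])
+         exclude = group_cfg['params'].get('exclude', [])
+         params = [
+             p for name, p in model.named_parameters()
+             if any(fnmatch(name, pat) for pat in include)
+             and not any(fnmatch(name, pat) for pat in exclude)
+         ]
+         groups.append({'params': params, 'lr': group_cfg['lr']})
+     # The optimizer class is looked up by name, e.g. "AdamW" -> torch.optim.AdamW.
+     return getattr(torch.optim, config['type'])(groups)
+ ```
+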
141
+ ## Run Training
142
+
143
+ Launch the training script [`moge/scripts/train.py`](../moge/scripts/train.py). Note that we use [`accelerate`](https://github.com/huggingface/accelerate) for distributed training.
144
+
145
+ ```bash
146
+ accelerate launch \
147
+ --num_processes 8 \
148
+ moge/scripts/train.py \
149
+ --config configs/train/v1.json \
150
+ --workspace workspace/debug \
151
+ --gradient_accumulation_steps 2 \
152
+ --batch_size_forward 2 \
153
+ --checkpoint latest \
154
+ --enable_gradient_checkpointing True \
155
+ --vis_every 1000 \
156
+ --enable_mlflow True
157
+ ```
158
+
159
+
160
+ ## Finetuning
161
+
162
+ To finetune the pre-trained MoGe model, download the model checkpoint and put it in a local directory, e.g. `pretrained/moge-vitl.pt`.
163
+
164
+ > NOTE: when finetuning the pretrained MoGe model, a much lower learning rate is required.
165
+ The suggested learning rate for finetuning is at most 1e-5 for the head and 1e-6 for the backbone.
166
+ A total batch size of at least 32 is recommended.
167
+ The settings in the default configuration are not necessarily optimal for specific datasets and may require further tuning.
168
+
169
+ ```bash
170
+ accelerate launch \
171
+ --num_processes 8 \
172
+ moge/scripts/train.py \
173
+ --config configs/train/v1.json \
174
+ --workspace workspace/debug \
175
+ --gradient_accumulation_steps 2 \
176
+ --batch_size_forward 2 \
177
+ --checkpoint pretrained/moge-vitl.pt \
178
+ --enable_gradient_checkpointing True \
179
+ --vis_every 1000 \
180
+ --enable_mlflow True
181
+ ```
example_images/01_HouseIndoor.jpg ADDED

Git LFS Details

  • SHA256: 3eb519bc68d4262af0c68166ca69e786cac5f6656a1083f4c585c4a94005c859
  • Pointer size: 131 Bytes
  • Size of remote file: 322 kB
example_images/02_Office.jpg ADDED

Git LFS Details

  • SHA256: 28767640002f93b703b24a34a6d75ca24b1ef093a19f52ef0f9d3b074ef68c61
  • Pointer size: 131 Bytes
  • Size of remote file: 198 kB
example_images/03_Traffic.jpg ADDED

Git LFS Details

  • SHA256: 4fa8b46849dd3de5b3b0a141d6aafe98e190f578ccec0c9dacc440cd8434db11
  • Pointer size: 132 Bytes
  • Size of remote file: 1.13 MB
example_images/04_BunnyCake.jpg ADDED

Git LFS Details

  • SHA256: 7ddd187d91ebc2cf626bc51a26e1fc71d478237ce348732ae547f83655f05260
  • Pointer size: 130 Bytes
  • Size of remote file: 69.1 kB
example_images/05_Mountain.jpg ADDED

Git LFS Details

  • SHA256: 670d322f6588713f7d9c7349091de0aacb2a5b0b37c7b7433995e110fb2bcfbc
  • Pointer size: 131 Bytes
  • Size of remote file: 666 kB
example_images/06_MaitreyaBuddha.png ADDED

Git LFS Details

  • SHA256: 396c5fd722bf5a21b931cbb70b883d6b1d5f9bab439cc426ec2f606fc2b7872d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.22 MB
example_images/07_Breads.jpg ADDED

Git LFS Details

  • SHA256: a95c2cab81412e252ee5a56a6100df31bb83de0f117607ca8476478f7f152a7b
  • Pointer size: 131 Bytes
  • Size of remote file: 156 kB
example_images/08_CatGirl.png ADDED

Git LFS Details

  • SHA256: 57fa6d587d598e7a428e8997b86d5c3a06e0e18529bfad8bab78ae03a1f5820f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.69 MB
example_images/09_Restaurant.jpg ADDED

Git LFS Details

  • SHA256: b2bb7b5a1e91a174101109b0976b8ae2a4d6bb7d6eadad6569106ed102d0d5a6
  • Pointer size: 131 Bytes
  • Size of remote file: 794 kB
example_images/10_MedievalVillage.jpg ADDED

Git LFS Details

  • SHA256: 718ed1aeb1e0010194c5cf0e95371e6a29d45b84e93efbed63ff4cc60e74508b
  • Pointer size: 131 Bytes
  • Size of remote file: 465 kB
example_images/11_Room.jpg ADDED

Git LFS Details

  • SHA256: 8f34b99e89f3a57952bb88f11a6dc87e4a75423f55ad26748783c92854543cf5
  • Pointer size: 131 Bytes
  • Size of remote file: 582 kB
example_images/12_StylizedHouses.jpg ADDED

Git LFS Details

  • SHA256: 18120b27ea499ef9c921a5a02e987c687327896c7bb649a9703682737d25a6b8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
example_images/panorama/Braunschweig_Panoram.jpg ADDED

Git LFS Details

  • SHA256: abc31b78f03a0b5254f3735bc3201c28d21b6855708f971ce4b6a740dfbddcba
  • Pointer size: 131 Bytes
  • Size of remote file: 563 kB
moge/__init__.py ADDED
File without changes
moge/model/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ import importlib
2
+ from typing import *
3
+
4
+ if TYPE_CHECKING:
5
+ from .v1 import MoGeModel as MoGeModelV1
6
+ from .v2 import MoGeModel as MoGeModelV2
7
+
8
+
9
+ def import_model_class_by_version(version: str) -> Type[Union['MoGeModelV1', 'MoGeModelV2']]:
10
+ assert version in ['v1', 'v2'], f'Unsupported model version: {version}'
11
+
12
+ try:
13
+ module = importlib.import_module(f'.{version}', __package__)
14
+ except ModuleNotFoundError:
15
+ raise ValueError(f'Model version "{version}" not found.')
16
+
17
+ cls = getattr(module, 'MoGeModel')
18
+ return cls
moge/model/dinov2/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ __version__ = "0.0.1"
moge/model/dinov2/hub/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
moge/model/dinov2/hub/backbones.py ADDED
@@ -0,0 +1,156 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ from typing import Union
8
+
9
+ import torch
10
+
11
+ from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
12
+
13
+
14
+ class Weights(Enum):
15
+ LVD142M = "LVD142M"
16
+
17
+
18
+ def _make_dinov2_model(
19
+ *,
20
+ arch_name: str = "vit_large",
21
+ img_size: int = 518,
22
+ patch_size: int = 14,
23
+ init_values: float = 1.0,
24
+ ffn_layer: str = "mlp",
25
+ block_chunks: int = 0,
26
+ num_register_tokens: int = 0,
27
+ interpolate_antialias: bool = False,
28
+ interpolate_offset: float = 0.1,
29
+ pretrained: bool = True,
30
+ weights: Union[Weights, str] = Weights.LVD142M,
31
+ **kwargs,
32
+ ):
33
+ from ..models import vision_transformer as vits
34
+
35
+ if isinstance(weights, str):
36
+ try:
37
+ weights = Weights[weights]
38
+ except KeyError:
39
+ raise AssertionError(f"Unsupported weights: {weights}")
40
+
41
+ model_base_name = _make_dinov2_model_name(arch_name, patch_size)
42
+ vit_kwargs = dict(
43
+ img_size=img_size,
44
+ patch_size=patch_size,
45
+ init_values=init_values,
46
+ ffn_layer=ffn_layer,
47
+ block_chunks=block_chunks,
48
+ num_register_tokens=num_register_tokens,
49
+ interpolate_antialias=interpolate_antialias,
50
+ interpolate_offset=interpolate_offset,
51
+ )
52
+ vit_kwargs.update(**kwargs)
53
+ model = vits.__dict__[arch_name](**vit_kwargs)
54
+
55
+ if pretrained:
56
+ model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
57
+ url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
58
+ state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
59
+ model.load_state_dict(state_dict, strict=True)
60
+
61
+ return model
62
+
63
+
64
+ def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
65
+ """
66
+ DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
67
+ """
68
+ return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)
69
+
70
+
71
+ def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
72
+ """
73
+ DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
74
+ """
75
+ return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)
76
+
77
+
78
+ def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
79
+ """
80
+ DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
81
+ """
82
+ return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)
83
+
84
+
85
+ def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
86
+ """
87
+ DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
88
+ """
89
+ return _make_dinov2_model(
90
+ arch_name="vit_giant2",
91
+ ffn_layer="swiglufused",
92
+ weights=weights,
93
+ pretrained=pretrained,
94
+ **kwargs,
95
+ )
96
+
97
+
98
+ def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
99
+ """
100
+ DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
101
+ """
102
+ return _make_dinov2_model(
103
+ arch_name="vit_small",
104
+ pretrained=pretrained,
105
+ weights=weights,
106
+ num_register_tokens=4,
107
+ interpolate_antialias=True,
108
+ interpolate_offset=0.0,
109
+ **kwargs,
110
+ )
111
+
112
+
113
+ def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
114
+ """
115
+ DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
116
+ """
117
+ return _make_dinov2_model(
118
+ arch_name="vit_base",
119
+ pretrained=pretrained,
120
+ weights=weights,
121
+ num_register_tokens=4,
122
+ interpolate_antialias=True,
123
+ interpolate_offset=0.0,
124
+ **kwargs,
125
+ )
126
+
127
+
128
+ def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
129
+ """
130
+ DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
131
+ """
132
+ return _make_dinov2_model(
133
+ arch_name="vit_large",
134
+ pretrained=pretrained,
135
+ weights=weights,
136
+ num_register_tokens=4,
137
+ interpolate_antialias=True,
138
+ interpolate_offset=0.0,
139
+ **kwargs,
140
+ )
141
+
142
+
143
+ def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
144
+ """
145
+ DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
146
+ """
147
+ return _make_dinov2_model(
148
+ arch_name="vit_giant2",
149
+ ffn_layer="swiglufused",
150
+ weights=weights,
151
+ pretrained=pretrained,
152
+ num_register_tokens=4,
153
+ interpolate_antialias=True,
154
+ interpolate_offset=0.0,
155
+ **kwargs,
156
+ )
moge/model/dinov2/hub/utils.py ADDED
@@ -0,0 +1,39 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
15
+
16
+
17
+ def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
18
+ compact_arch_name = arch_name.replace("_", "")[:4]
19
+ registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
20
+ return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
21
+
22
+
23
+ class CenterPadding(nn.Module):
24
+ def __init__(self, multiple):
25
+ super().__init__()
26
+ self.multiple = multiple
27
+
28
+ def _get_pad(self, size):
29
+ new_size = math.ceil(size / self.multiple) * self.multiple
30
+ pad_size = new_size - size
31
+ pad_size_left = pad_size // 2
32
+ pad_size_right = pad_size - pad_size_left
33
+ return pad_size_left, pad_size_right
34
+
35
+ @torch.inference_mode()
36
+ def forward(self, x):
37
+ pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
38
+ output = F.pad(x, pads)
39
+ return output
moge/model/dinov2/layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .dino_head import DINOHead
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
moge/model/dinov2/layers/attention.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import logging
11
+ import os
12
+ import warnings
13
+
14
+ from torch import Tensor
15
+ from torch import nn
16
+
17
+
18
+ logger = logging.getLogger("dinov2")
19
+
20
+
21
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
22
+ try:
23
+ if XFORMERS_ENABLED:
24
+ from xformers.ops import memory_efficient_attention, unbind
25
+
26
+ XFORMERS_AVAILABLE = True
27
+ # warnings.warn("xFormers is available (Attention)")
28
+ else:
29
+ # warnings.warn("xFormers is disabled (Attention)")
30
+ raise ImportError
31
+ except ImportError:
32
+ XFORMERS_AVAILABLE = False
33
+ # warnings.warn("xFormers is not available (Attention)")
34
+
35
+
36
+ class Attention(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int = 8,
41
+ qkv_bias: bool = False,
42
+ proj_bias: bool = True,
43
+ attn_drop: float = 0.0,
44
+ proj_drop: float = 0.0,
45
+ ) -> None:
46
+ super().__init__()
47
+ self.num_heads = num_heads
48
+ head_dim = dim // num_heads
49
+ self.scale = head_dim**-0.5
50
+
51
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
52
+ self.attn_drop = nn.Dropout(attn_drop)
53
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
54
+ self.proj_drop = nn.Dropout(proj_drop)
55
+
56
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
57
+ B, N, C = x.shape
58
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
59
+
60
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
61
+ attn = q @ k.transpose(-2, -1)
62
+
63
+ attn = attn.softmax(dim=-1)
64
+ attn = self.attn_drop(attn)
65
+
66
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
67
+ x = self.proj(x)
68
+ x = self.proj_drop(x)
69
+ return x
70
+
71
+
72
+ class MemEffAttention(Attention):
73
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
74
+ if not XFORMERS_AVAILABLE:
75
+ if attn_bias is not None:
76
+ raise AssertionError("xFormers is required for using nested tensors")
77
+ return super().forward(x)
78
+
79
+ B, N, C = x.shape
80
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
81
+
82
+ q, k, v = unbind(qkv, 2)
83
+
84
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
85
+ x = x.reshape([B, N, C])
86
+
87
+ x = self.proj(x)
88
+ x = self.proj_drop(x)
89
+ return x