Browse Source

Init commit

main
silencht 2 years ago
commit
b990c0eff3
  1. 13
      .gitignore
  2. 28
      LICENSE
  3. 197
      README.md
  4. 21
      act/LICENSE
  5. 8
      act/README.md
  6. 23
      act/conda_env.yaml
  7. 201
      act/detr/LICENSE
  8. 9
      act/detr/README.md
  9. 128
      act/detr/main.py
  10. 9
      act/detr/models/__init__.py
  11. 144
      act/detr/models/backbone.py
  12. 302
      act/detr/models/detr_vae.py
  13. 93
      act/detr/models/position_encoding.py
  14. 314
      act/detr/models/transformer.py
  15. 10
      act/detr/setup.py
  16. 1
      act/detr/util/__init__.py
  17. 88
      act/detr/util/box_ops.py
  18. 468
      act/detr/util/misc.py
  19. 107
      act/detr/util/plot_utils.py
  20. 367
      act/imitate_episodes.py
  21. 115
      act/policy.py
  22. 286
      act/utils.py
  23. 79
      assets/H1_5/README.md
  24. 1623
      assets/H1_5/h1_5.urdf
  25. 438
      assets/H1_5/h1_5.xml
  26. BIN
      assets/H1_5/meshes/L_hand_base_link.STL
  27. BIN
      assets/H1_5/meshes/L_index_intermediate.STL
  28. BIN
      assets/H1_5/meshes/L_index_proximal.STL
  29. BIN
      assets/H1_5/meshes/L_middle_intermediate.STL
  30. BIN
      assets/H1_5/meshes/L_middle_proximal.STL
  31. BIN
      assets/H1_5/meshes/L_pinky_intermediate.STL
  32. BIN
      assets/H1_5/meshes/L_pinky_proximal.STL
  33. BIN
      assets/H1_5/meshes/L_ring_intermediate.STL
  34. BIN
      assets/H1_5/meshes/L_ring_proximal.STL
  35. BIN
      assets/H1_5/meshes/L_thumb_distal.STL
  36. BIN
      assets/H1_5/meshes/L_thumb_intermediate.STL
  37. BIN
      assets/H1_5/meshes/L_thumb_proximal.STL
  38. BIN
      assets/H1_5/meshes/L_thumb_proximal_base.STL
  39. BIN
      assets/H1_5/meshes/R_hand_base_link.STL
  40. BIN
      assets/H1_5/meshes/R_index_intermediate.STL
  41. BIN
      assets/H1_5/meshes/R_index_proximal.STL
  42. BIN
      assets/H1_5/meshes/R_middle_intermediate.STL
  43. BIN
      assets/H1_5/meshes/R_middle_proximal.STL
  44. BIN
      assets/H1_5/meshes/R_pinky_intermediate.STL
  45. BIN
      assets/H1_5/meshes/R_pinky_proximal.STL
  46. BIN
      assets/H1_5/meshes/R_ring_intermediate.STL
  47. BIN
      assets/H1_5/meshes/R_ring_proximal.STL
  48. BIN
      assets/H1_5/meshes/R_thumb_distal.STL
  49. BIN
      assets/H1_5/meshes/R_thumb_intermediate.STL
  50. BIN
      assets/H1_5/meshes/R_thumb_proximal.STL
  51. BIN
      assets/H1_5/meshes/R_thumb_proximal_base.STL
  52. BIN
      assets/H1_5/meshes/left_ankle_A_link.STL
  53. BIN
      assets/H1_5/meshes/left_ankle_A_rod_link.STL
  54. BIN
      assets/H1_5/meshes/left_ankle_B_link.STL
  55. BIN
      assets/H1_5/meshes/left_ankle_B_rod_link.STL
  56. BIN
      assets/H1_5/meshes/left_ankle_pitch_link.STL
  57. BIN
      assets/H1_5/meshes/left_ankle_roll_link.STL
  58. BIN
      assets/H1_5/meshes/left_elbow_pitch_link.STL
  59. BIN
      assets/H1_5/meshes/left_elbow_roll_link.STL
  60. BIN
      assets/H1_5/meshes/left_hand_link.STL
  61. BIN
      assets/H1_5/meshes/left_hip_pitch_link.STL
  62. BIN
      assets/H1_5/meshes/left_hip_roll_link.STL
  63. BIN
      assets/H1_5/meshes/left_hip_yaw_link.STL
  64. BIN
      assets/H1_5/meshes/left_knee_link.STL
  65. BIN
      assets/H1_5/meshes/left_shoulder_pitch_link.STL
  66. BIN
      assets/H1_5/meshes/left_shoulder_roll_link.STL
  67. BIN
      assets/H1_5/meshes/left_shoulder_yaw_link.STL
  68. BIN
      assets/H1_5/meshes/left_wrist_pitch_link.STL
  69. BIN
      assets/H1_5/meshes/link11_L.STL
  70. BIN
      assets/H1_5/meshes/link11_R.STL
  71. BIN
      assets/H1_5/meshes/link12_L.STL
  72. BIN
      assets/H1_5/meshes/link12_R.STL
  73. BIN
      assets/H1_5/meshes/link13_L.STL
  74. BIN
      assets/H1_5/meshes/link13_R.STL
  75. BIN
      assets/H1_5/meshes/link14_L.STL
  76. BIN
      assets/H1_5/meshes/link14_R.STL
  77. BIN
      assets/H1_5/meshes/link15_L.STL
  78. BIN
      assets/H1_5/meshes/link15_R.STL
  79. BIN
      assets/H1_5/meshes/link16_L.STL
  80. BIN
      assets/H1_5/meshes/link16_R.STL
  81. BIN
      assets/H1_5/meshes/link17_L.STL
  82. BIN
      assets/H1_5/meshes/link17_R.STL
  83. BIN
      assets/H1_5/meshes/link18_L.STL
  84. BIN
      assets/H1_5/meshes/link18_R.STL
  85. BIN
      assets/H1_5/meshes/link19_L.STL
  86. BIN
      assets/H1_5/meshes/link19_R.STL
  87. BIN
      assets/H1_5/meshes/link20_L.STL
  88. BIN
      assets/H1_5/meshes/link20_R.STL
  89. BIN
      assets/H1_5/meshes/link21_L.STL
  90. BIN
      assets/H1_5/meshes/link21_R.STL
  91. BIN
      assets/H1_5/meshes/link22_L.STL
  92. BIN
      assets/H1_5/meshes/link22_R.STL
  93. BIN
      assets/H1_5/meshes/logo_link.STL
  94. BIN
      assets/H1_5/meshes/pelvis.STL
  95. BIN
      assets/H1_5/meshes/right_ankle_A_link.STL
  96. BIN
      assets/H1_5/meshes/right_ankle_A_rod_link.STL
  97. BIN
      assets/H1_5/meshes/right_ankle_B_link.STL
  98. BIN
      assets/H1_5/meshes/right_ankle_B_rod_link.STL
  99. BIN
      assets/H1_5/meshes/right_ankle_link.STL
  100. BIN
      assets/H1_5/meshes/right_ankle_pitch_link.STL

13
.gitignore

@ -0,0 +1,13 @@
.idea/
__MACOSX/
.DS_Store
*pycache*
*.pyc
*.egg-info
*.mp4
*build/
*.pem
*.webm
*.jpg
*.svo
*.png

28
LICENSE

@ -0,0 +1,28 @@
Copyright [2024] [Xuxin Cheng, Jialong Li, Shiqi Yang, Ge Yang and Xiaolong Wang]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
------------------
This code builds upon following open-source code-bases. Please visit the URLs to see the respective LICENSES:
1) https://github.com/tonyzhaozh/act
2) https://github.com/facebookresearch/detr
3) https://github.com/dexsuite/dex-retargeting
4) https://github.com/vuer-ai/vuer
5) https://github.com/stack-of-tasks/pinocchio
6) https://github.com/casadi/casadi
7) https://github.com/meshcat-dev/meshcat-python
8) https://github.com/zeromq/pyzmq
------------------

197
README.md

@ -0,0 +1,197 @@
# Video Demo
<img src="./img/1.webp" autoplay loop="loop" style="width: 49%" controls></img><img src="./img/2.webp" autoplay loop="loop" style="width: 49%" controls></img>
# Introduction
This repository implements teleoperation of the humanoid robot Unitree H1_2 using Apple Vision Pro.
# Prerequisites
We tested our code on Ubuntu 20.04 and Ubuntu 22.04; other operating systems may require different configuration.
For more information, you can refer to [Official documentation ]() and [OpenTeleVision](https://github.com/OpenTeleVision/TeleVision).
## inverse kinematics
```bash
conda create -n tv python=3.8
conda activate tv
# If you use `pip install`, make sure the pinocchio version is 3.1.0
conda install pinocchio -c conda-forge
pip install meshcat
```
## unitree_dds_wrapper
```bash
# Install the Python version of the unitree_dds_wrapper.
git clone https://github.com/unitreerobotics/unitree_dds_wrapper.git
cd unitree_dds_wrapper/python
pip3 install -e .
```
## TeleVision and Apple Vision Pro configuration
### basic
```bash
cd ~
git clone https://github.com/unitreerobotics/avp_teleoperate_robot.git
cd ~/avp_teleoperate_robot
pip install -r requirements.txt
cd act/detr && pip install -e .
```
### Isaac Gym
If you want to try the teleoperation example in a simulated environment (teleop_hand.py):
1. Download Isaac Gym: https://developer.nvidia.com/isaac-gym/download
2. Extract it to the current directory, then go to the `IsaacGym_Preview_4_Package/isaacgym/python` directory and run: `pip install -e .`
### Local streaming
**Apple** does not allow WebXR on non-https connections. To test the application locally, we need to create a self-signed certificate and install it on the client. You need an Ubuntu machine and a router. Connect the VisionPro and the Ubuntu machine to the same router.
1. install mkcert: https://github.com/FiloSottile/mkcert
2. check local ip address:
```bash
ifconfig | grep inet
```
Suppose the local ip address of the ubuntu machine is `192.168.123.2`.
3. create certificate:
```bash
mkcert -install && mkcert -cert-file cert.pem -key-file key.pem 192.168.123.2 localhost 127.0.0.1
```
ps. place the generated cert.pem and key.pem files in teleop.
```bash
cp cert.pem key.pem ~/avp_teleoperate_robot/teleop/
```
4. open firewall on server:
```bash
sudo ufw allow 8012
```
5. install ca-certificates on VisionPro:
```
mkcert -CAROOT
```
Copy the rootCA.pem via AirDrop to VisionPro and install it.
Settings > General > About > Certificate Trust Settings. Under "Enable full trust for root certificates", turn on trust for the certificate.
Settings > Apps > Safari > Advanced > Feature Flags > Enable WebXR Related Features
6. Open Safari on VisionPro and go to https://192.168.123.2:8012?ws=wss://192.168.123.2:8012
7. Click "Enter VR" and Allow to start the VR session.
### Simulation Teleoperation Example
1. After setting up streaming (either local or network) following the instructions above, you can try teleoperating two robot hands in Isaac Gym:
```bash
cd teleop && python teleop_hand.py
```
2. Go to your vuer site on VisionPro, click `Enter VR` and `Allow` to enter immersive environment.
3. See your hands in 3D!
<div style="center">
<img src="https://doc-cdn.unitree.com/static/2024/7/25/4b1b2327d4774abfbe8ef1c084d81cd7_2686x1627.png" width="50%">
</div>
<!-- <p style="text-align: center;">
<img src="https://doc-cdn.unitree.com/static/2024/7/25/4b1b2327d4774abfbe8ef1c084d81cd7_2686x1627.png" style="display: block; margin: auto; width: 30%;">
</p> -->
# Usage
## Dexterous hands service
On Unitree H1_2's PC, execute command:
```bash
sudo ./inspire_hand -s /dev/ttyUSB0
```
Open another terminal and execute the following command to test. If two hands open and close continuously, it indicates success.
```bash
./h1_hand_example
```
## Image Server
Copy `image_server.py` in the `avp_teleoperate_robot/teleop/image_server` directory to the PC of Unitree H1_2, and execute the following command **in the PC**:
```bash
sudo python image_server.py
```
After image service is started, you can use `image_client.py` **in the Host** terminal to test whether the communication is successful:
```bash
python image_client.py
```
## Start
```bash
python unitree_human_robot.py
```
# Codebase Tutorial
The overall structure of the code remains the same as TeleVision, so we only describe the files and directories related to the Unitree robot:
avp_teleoperate_robot/
├── act [Documents Related to ACT Policy for Imitation Learning]
├── assets [Storage of robot URDF-related files]
├── scripts
├── teleop
│ ├── image_server/ [Image Transfer Server and Client Code]
│ │ ├── image_client.py [Client (only used to test whether the image stream service is OK, not used for teleoperation)]
│ │ ├── image_server.py [Capture images from binocular cameras and send via network (performed on Unitree H1_2)]
│ │
│ ├── robot_control/ [Storage of IK solver, arm and hand control related documents]
│ │ ├── robot_arm_ik.py [Inverse kinematics of the arm]
│ │ ├── robot_arm.py [Control dual arm joints and lock the others]
│ │ ├── robot_hand.py [Control hand joints]
│ │
│ ├── teleop_hand_and_arm.py [Startup execution code for teleoperation]
│ └── teleop_hand.py [Can be used for testing the environment configuration]
# Acknowledgement
This code builds upon the following open-source code-bases. Please visit the URLs to see the respective LICENSES:
1) https://github.com/OpenTeleVision/TeleVision
2) https://github.com/dexsuite/dex-retargeting
3) https://github.com/vuer-ai/vuer
4) https://github.com/stack-of-tasks/pinocchio
5) https://github.com/casadi/casadi
6) https://github.com/meshcat-dev/meshcat-python
7) https://github.com/zeromq/pyzmq
8) https://github.com/unitreerobotics/unitree_dds_wrapper
9) https://github.com/tonyzhaozh/act
10) https://github.com/facebookresearch/detr

21
act/LICENSE

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Tony Z. Zhao
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

8
act/README.md

@ -0,0 +1,8 @@
This part of the codebase is modified from ACT https://github.com/tonyzhaozh/act under MIT License.
@article{zhao2023learning,
title={Learning fine-grained bimanual manipulation with low-cost hardware},
author={Zhao, Tony Z and Kumar, Vikash and Levine, Sergey and Finn, Chelsea},
journal={arXiv preprint arXiv:2304.13705},
year={2023}
}

23
act/conda_env.yaml

@ -0,0 +1,23 @@
name: aloha
channels:
- pytorch
- nvidia
- conda-forge
dependencies:
- python=3.9
- pip=23.0.1
- pytorch=2.0.0
- torchvision=0.15.0
- pytorch-cuda=11.8
- pyquaternion=0.9.9
- pyyaml=6.0
- rospkg=1.5.0
- pexpect=4.8.0
- mujoco=2.3.3
- dm_control=1.0.9
- py-opencv=4.7.0
- matplotlib=3.7.1
- einops=0.6.0
- packaging=23.0
- h5py=3.8.0
- ipython=8.12.0

201
act/detr/LICENSE

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2020 - present, Facebook, Inc
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

9
act/detr/README.md

@ -0,0 +1,9 @@
This part of the codebase is modified from DETR https://github.com/facebookresearch/detr under APACHE 2.0.
@article{Carion2020EndtoEndOD,
title={End-to-End Object Detection with Transformers},
author={Nicolas Carion and Francisco Massa and Gabriel Synnaeve and Nicolas Usunier and Alexander Kirillov and Sergey Zagoruyko},
journal={ArXiv},
year={2020},
volume={abs/2005.12872}
}

128
act/detr/main.py

@ -0,0 +1,128 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
from pathlib import Path
import numpy as np
import torch
from .models import build_ACT_model, build_CNNMLP_model
import IPython
e = IPython.embed
def get_args_parser():
    """Build the argument parser shared by the ACT and CNN-MLP model builders.

    The parser is created with ``add_help=False`` so it can be used as a
    ``parents=[...]`` entry of another ``argparse.ArgumentParser``.

    Several options are marked "will be overridden" or "not used": the model
    builders in this file overwrite parsed values with the entries of their
    ``args_override`` dict, so those defaults are only placeholders.

    Returns:
        argparse.ArgumentParser: parser with all model/training options
        registered. ``--policy_class``, ``--seed``, ``--num_epochs``,
        ``--taskid`` and ``--exptid`` are required.
    """
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
    parser.add_argument('--lr', default=1e-4, type=float)  # will be overridden
    parser.add_argument('--lr_backbone', default=1e-5, type=float)  # will be overridden
    parser.add_argument('--batch_size', default=2, type=int)  # not used
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--epochs', default=300, type=int)  # not used
    parser.add_argument('--lr_drop', default=200, type=int)  # not used
    parser.add_argument('--clip_max_norm', default=0.1, type=float,  # not used
                        help='gradient clipping max norm')
    # The help text used to read 'lr' (copy-paste slip); it now names the option.
    parser.add_argument('--qpos_noise_std', action='store', default=0, type=float,
                        help='qpos_noise_std', required=False)

    # Model parameters
    # * Backbone
    parser.add_argument('--backbone', default='resnet18', type=str,  # will be overridden
                        help="Name of the convolutional backbone to use")
    parser.add_argument('--dilation', action='store_true',
                        help="If true, we replace stride with dilation in the last convolutional block (DC5)")
    parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'),
                        help="Type of positional embedding to use on top of the image features")
    # `type=list` would split one string into single characters
    # ("left" -> ['l', 'e', 'f', 't']); nargs='*' collects whole names instead.
    parser.add_argument('--camera_names', default=[], nargs='*', type=str,  # will be overridden
                        help="A list of camera names")

    # * Transformer
    parser.add_argument('--enc_layers', default=4, type=int,  # will be overridden
                        help="Number of encoding layers in the transformer")
    parser.add_argument('--dec_layers', default=6, type=int,  # will be overridden
                        help="Number of decoding layers in the transformer")
    parser.add_argument('--dim_feedforward', default=2048, type=int,  # will be overridden
                        help="Intermediate size of the feedforward layers in the transformer blocks")
    parser.add_argument('--hidden_dim', default=256, type=int,  # will be overridden
                        help="Size of the embeddings (dimension of the transformer)")
    parser.add_argument('--dropout', default=0.1, type=float,
                        help="Dropout applied in the transformer")
    parser.add_argument('--nheads', default=8, type=int,  # will be overridden
                        help="Number of attention heads inside the transformer's attentions")
    parser.add_argument('--num_queries', default=400, type=int,  # will be overridden
                        help="Number of query slots")
    parser.add_argument('--pre_norm', action='store_true')

    # * Segmentation
    parser.add_argument('--masks', action='store_true',
                        help="Train segmentation head if the flag is provided")

    # repeat args in imitate_episodes just to avoid error. Will not be used
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--onscreen_render', action='store_true')
    # parser.add_argument('--ckpt_dir', action='store', type=str, help='ckpt_dir', required=True)
    parser.add_argument('--policy_class', action='store', type=str, help='policy_class, capitalize', required=True)
    # parser.add_argument('--task_name', action='store', type=str, help='task_name', required=True)
    parser.add_argument('--seed', action='store', type=int, help='seed', required=True)
    parser.add_argument('--num_epochs', action='store', type=int, help='num_epochs', required=True)
    parser.add_argument('--kl_weight', action='store', type=int, help='KL Weight', required=False)
    parser.add_argument('--chunk_size', action='store', type=int, help='chunk_size', required=False)
    parser.add_argument('--temporal_agg', action='store_true')

    parser.add_argument('--save_jit', action='store_true')
    parser.add_argument('--ckpt_dir', default='/home/cxx/h1_hardware/data/logs', type=str,  # will be overridden
                        help='ckpt_dir')
    parser.add_argument('--no_wandb', action='store_true')
    parser.add_argument('--resumeid', action='store', type=str, help='resume id', required=False)
    parser.add_argument('--resume_ckpt', action='store', type=str, help='resume ckpt', required=False)
    parser.add_argument('--taskid', action='store', type=str, help='task id', required=True)
    parser.add_argument('--exptid', action='store', type=str, help='experiment id', required=True)
    parser.add_argument('--source', choices=['self', 'ssd'], default='self')
    return parser
def _build_model_and_optimizer(build_model, args_override):
    """Shared implementation behind the two public ``build_*_model_and_optimizer`` functions.

    Parses the process command line with the options from
    ``get_args_parser``, applies ``args_override`` on top, builds the model
    with ``build_model``, moves it to CUDA and creates an AdamW optimizer.

    Args:
        build_model: callable taking the parsed ``args`` namespace and
            returning the model.
        args_override: mapping of attribute name -> value; each entry is set
            on the parsed namespace, replacing the command-line/default value.

    Returns:
        tuple: ``(model, optimizer)``.
    """
    parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
    # NOTE: reads sys.argv, so the required options of get_args_parser must
    # be present on the command line of the calling script.
    args = parser.parse_args()

    for key, value in args_override.items():
        setattr(args, key, value)

    model = build_model(args)
    model.cuda()

    # Split trainable parameters into two groups so parameters whose name
    # contains "backbone" get their own learning rate (args.lr_backbone).
    backbone_params = []
    other_params = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "backbone" in name:
            backbone_params.append(param)
        else:
            other_params.append(param)

    param_dicts = [
        {"params": other_params},
        {"params": backbone_params, "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    return model, optimizer


def build_ACT_model_and_optimizer(args_override):
    """Build the ACT model (via ``build_ACT_model``) and its AdamW optimizer.

    Args:
        args_override: mapping of argument names to values that replace the
            parsed command-line arguments before the model is built.

    Returns:
        tuple: ``(model, optimizer)``; the model has been moved to CUDA.
    """
    return _build_model_and_optimizer(build_ACT_model, args_override)


def build_CNNMLP_model_and_optimizer(args_override):
    """Build the CNN-MLP model (via ``build_CNNMLP_model``) and its AdamW optimizer.

    Args:
        args_override: mapping of argument names to values that replace the
            parsed command-line arguments before the model is built.

    Returns:
        tuple: ``(model, optimizer)``; the model has been moved to CUDA.
    """
    return _build_model_and_optimizer(build_CNNMLP_model, args_override)

9
act/detr/models/__init__.py

@ -0,0 +1,9 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .detr_vae import build as build_vae
from .detr_vae import build_cnnmlp as build_cnnmlp
def build_ACT_model(args):
    """Return the ACT model produced by ``detr_vae.build`` for ``args``."""
    act_model = build_vae(args)
    return act_model
def build_CNNMLP_model(args):
    """Return the CNN-MLP model produced by ``detr_vae.build_cnnmlp`` for ``args``."""
    cnnmlp_model = build_cnnmlp(args)
    return cnnmlp_model

144
act/detr/models/backbone.py

@ -0,0 +1,144 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Backbone modules.
"""
from collections import OrderedDict
import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List
from util.misc import NestedTensor, is_main_process
from .position_encoding import build_position_encoding
import IPython
e = IPython.embed
class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.

    Args:
        n: number of channels; sizes the ``weight``, ``bias``,
            ``running_mean`` and ``running_var`` buffers.
        eps: value added to ``running_var`` before the reciprocal square
            root. Defaults to ``1e-5``, the value that used to be hard-coded.
    """

    def __init__(self, n, eps=1e-5):
        super(FrozenBatchNorm2d, self).__init__()
        self.eps = eps
        # Registered as buffers (not parameters) so they are saved and loaded
        # with the state dict but never returned by .parameters().
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        # This module has no `num_batches_tracked` buffer; drop the key from
        # incoming checkpoints so it is not reported as unexpected.
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        """Apply the fixed per-channel affine normalization to ``x``.

        The buffers are reshaped to ``(1, C, 1, 1)`` and broadcast against
        ``x``; returns ``x * scale + bias`` with
        ``scale = weight * rsqrt(running_var + eps)`` and
        ``bias = bias - running_mean * scale``.
        """
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        scale = w * (rv + self.eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias
class BackboneBase(nn.Module):
    """Expose selected residual stages of a backbone through ``IntermediateLayerGetter``."""

    def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool):
        """
        Parameters:
            backbone: network whose ``layer1``..``layer4`` children are tapped.
            train_backbone: accepted for interface compatibility; it is not consulted
                here, so no parameter is frozen by this class.
            num_channels: stored on the instance as ``num_channels``.
            return_interm_layers: when true, outputs of ``layer1``..``layer4`` are
                returned under keys "0".."3"; otherwise only ``layer4`` under "0".
        """
        super().__init__()
        return_layers = (
            {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
            if return_interm_layers
            else {'layer4': "0"}
        )
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor):
        """Run ``tensor`` through the wrapped body and return its mapping of feature maps."""
        return self.body(tensor)
class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""

    def __init__(self, name: str,
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
        """
        Parameters:
            name: attribute name of the constructor in ``torchvision.models``.
            train_backbone: forwarded to ``BackboneBase``.
            return_interm_layers: forwarded to ``BackboneBase``.
            dilation: replaces the stride of the last stage with dilation when true.
        """
        resnet_factory = getattr(torchvision.models, name)
        # Pretrained weights are requested only when is_main_process() is true;
        # every BatchNorm is swapped for the frozen variant.
        backbone = resnet_factory(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(),
            norm_layer=FrozenBatchNorm2d,
        )
        if name in ('resnet18', 'resnet34'):
            num_channels = 512
        else:
            num_channels = 2048
        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)
class DINOv2BackBone(nn.Module):
    """Feature extractor backed by the ``dinov2_vits14`` model from torch.hub.

    ``forward`` runs under ``torch.no_grad()``, so no gradient reaches the
    backbone through it.
    """

    def __init__(self) -> None:
        super().__init__()
        dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
        self.body = dinov2
        self.body.eval()
        self.num_channels = 384

    @torch.no_grad()
    def forward(self, tensor):
        """Return an ``OrderedDict`` with the patch-token feature map under key "0".

        The patch tokens are reshaped to a fixed (22, 16) grid with 384 channels
        and permuted to ``(batch, 384, 16, 22)``; inputs therefore have to
        produce exactly 22 * 16 patch tokens.
        """
        patch_tokens = self.body.forward_features(tensor)["x_norm_patchtokens"]
        batch = patch_tokens.shape[0]
        feature_map = patch_tokens.reshape(batch, 22, 16, 384).permute(0, 3, 2, 1)
        features = OrderedDict()
        features["0"] = feature_map
        return features
class Joiner(nn.Sequential):
    """Pair a backbone (index 0) with a position-embedding module (index 1)."""

    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: torch.Tensor):
        """Compute backbone feature maps and a position encoding for each of them.

        Parameters:
            tensor_list: input handed straight to the backbone. Despite the
                historical name it is a plain tensor, not a ``NestedTensor``.

        Returns:
            ``(out, pos)``: two lists of equal length, holding the backbone's
            feature maps (in the order the backbone returns them) and the
            matching position encodings cast to each feature map's dtype.
        """
        feature_maps = self[0](tensor_list)
        out: List[torch.Tensor] = []
        pos: List[torch.Tensor] = []
        for feature in feature_maps.values():
            out.append(feature)
            # position encoding
            pos.append(self[1](feature).to(feature.dtype))
        return out, pos
def build_backbone(args):
    """Assemble backbone + position encoding into a ``Joiner``.

    Reads ``args.backbone`` ('dino_v2', 'resnet18' or 'resnet34'),
    ``args.lr_backbone``, ``args.masks`` and ``args.dilation``. The returned
    module also carries the backbone's ``num_channels``.
    """
    position_embedding = build_position_encoding(args)
    train_backbone = args.lr_backbone > 0
    return_interm_layers = args.masks
    use_dino = args.backbone == 'dino_v2'
    if not use_dino:
        assert args.backbone in ['resnet18', 'resnet34']
        backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
    else:
        backbone = DINOv2BackBone()
    joined = Joiner(backbone, position_embedding)
    joined.num_channels = backbone.num_channels
    return joined

302
act/detr/models/detr_vae.py

@ -0,0 +1,302 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DETR model and criterion classes.
"""
import torch
from torch import nn
from torch.autograd import Variable
from .backbone import build_backbone
from .transformer import build_transformer, TransformerEncoder, TransformerEncoderLayer
import numpy as np
import time
import IPython
e = IPython.embed
def reparametrize(mu, logvar):
    """Draw ``mu + std * eps`` with ``eps ~ N(0, I)`` (reparameterisation trick).

    Parameters:
        mu: mean of the Gaussian.
        logvar: log-variance of the Gaussian; ``std = exp(logvar / 2)``.

    Returns:
        A sample with the broadcast shape of ``mu`` and ``logvar``. The noise is
        a constant with respect to autograd, so gradients flow to ``mu`` and
        ``logvar`` only.
    """
    std = logvar.div(2).exp()
    # randn_like matches std's shape, dtype and device without the deprecated
    # torch.autograd.Variable / .data API.
    eps = torch.randn_like(std)
    return mu + std * eps
def get_sinusoid_encoding_table(n_position, d_hid):
    """Build a fixed sinusoidal position table.

    Parameters:
        n_position: number of positions (rows).
        d_hid: embedding width (columns).

    Returns:
        Float tensor of shape ``(1, n_position, d_hid)``; even columns hold
        sines and odd columns cosines of ``pos / 10000^(2 * (j // 2) / d_hid)``.
    """
    rows = []
    for position in range(n_position):
        angles = [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
        rows.append(angles)
    table = np.array(rows)
    table[:, 0::2] = np.sin(table[:, 0::2])  # dim 2i
    table[:, 1::2] = np.cos(table[:, 1::2])  # dim 2i+1
    as_tensor = torch.FloatTensor(table)
    return as_tensor.unsqueeze(0)
class DETRVAE(nn.Module):
    """Conditional-VAE action-chunk policy built around a DETR-style transformer.

    When ``actions`` are supplied, a transformer encoder compresses
    ``[CLS], qpos, actions`` into a latent sample (reparameterisation trick);
    otherwise the latent is all zeros. The main transformer then decodes
    ``num_queries`` action predictions from the camera features, the projected
    robot state and the projected latent.
    """

    def __init__(self, backbones, transformer, encoder, state_dim, action_dim, num_queries, camera_names):
        """ Initializes the model.
        Parameters:
            backbones: list of backbone modules (see backbone.py). Only ``backbones[0]`` is
                       used in ``forward``; ``None`` is not supported.
            transformer: torch module of the transformer architecture. See transformer.py
            encoder: transformer encoder used to infer the latent from the action sequence.
            state_dim: robot state dimension of the environment
            action_dim: dimension of a single action vector.
            num_queries: number of decoder queries, i.e. number of predicted action steps.
            camera_names: one entry per camera; its length drives the per-camera loop.
        Raises:
            NotImplementedError: if ``backbones`` is None.
        """
        super().__init__()
        self.num_queries = num_queries
        self.camera_names = camera_names
        self.transformer = transformer
        self.encoder = encoder
        hidden_dim = transformer.d_model
        self.action_head = nn.Linear(hidden_dim, action_dim)
        self.is_pad_head = nn.Linear(hidden_dim, 1)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        if backbones is None:
            # The state-only variant is not implemented.
            raise NotImplementedError
        self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1)
        self.backbones = nn.ModuleList(backbones)
        self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim)

        # encoder extra parameters
        self.latent_dim = 32  # final size of latent z # TODO tune
        self.cls_embed = nn.Embedding(1, hidden_dim)  # extra cls token embedding
        self.encoder_action_proj = nn.Linear(action_dim, hidden_dim)  # project action to embedding
        self.encoder_joint_proj = nn.Linear(state_dim, hidden_dim)  # project qpos to embedding
        self.latent_proj = nn.Linear(hidden_dim, self.latent_dim*2)  # project hidden state to latent mean, log-variance
        self.register_buffer('pos_table', get_sinusoid_encoding_table(1+1+num_queries, hidden_dim))  # [CLS], qpos, a_seq

        # decoder extra parameters
        self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim)  # project latent sample to embedding
        self.additional_pos_embed = nn.Embedding(2, hidden_dim)  # learned position embedding for proprio and latent

    def forward(self, qpos, image, env_state, actions=None, is_pad=None):
        """
        qpos: batch, qpos_dim
        image: batch, num_cam, channel, height, width
        env_state: None (unused)
        actions: batch, seq, action_dim; when given, the latent is inferred from it
        is_pad: batch, seq boolean padding mask; required whenever ``actions`` is given

        Returns ``(a_hat, is_pad_hat, [mu, logvar])``; ``mu`` and ``logvar`` are
        None when ``actions`` is None.
        """
        is_training = actions is not None  # train or val
        bs, _ = qpos.shape
        ### Obtain latent z from action sequence
        if is_training:
            # project action sequence to embedding dim, and concat with a CLS token
            action_embed = self.encoder_action_proj(actions)  # (bs, seq, hidden_dim)
            qpos_embed = self.encoder_joint_proj(qpos)  # (bs, hidden_dim)
            qpos_embed = torch.unsqueeze(qpos_embed, axis=1)  # (bs, 1, hidden_dim)
            cls_embed = self.cls_embed.weight  # (1, hidden_dim)
            cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1)  # (bs, 1, hidden_dim)
            encoder_input = torch.cat([cls_embed, qpos_embed, action_embed], axis=1)  # (bs, seq+2, hidden_dim)
            encoder_input = encoder_input.permute(1, 0, 2)  # (seq+2, bs, hidden_dim)
            # do not mask the cls and qpos tokens
            cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device)  # False: not a padding
            is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1)  # (bs, seq+2)
            # obtain position embedding
            pos_embed = self.pos_table.clone().detach()
            pos_embed = pos_embed.permute(1, 0, 2)  # (num_queries+2, 1, hidden_dim)
            # query model
            encoder_output = self.encoder(encoder_input, pos=pos_embed, src_key_padding_mask=is_pad)
            encoder_output = encoder_output[0]  # take cls output only
            latent_info = self.latent_proj(encoder_output)
            mu = latent_info[:, :self.latent_dim]
            logvar = latent_info[:, self.latent_dim:]
            latent_sample = reparametrize(mu, logvar)
            latent_input = self.latent_out_proj(latent_sample)
        else:
            mu = logvar = None
            latent_sample = torch.zeros([bs, self.latent_dim], dtype=torch.float32).to(qpos.device)
            latent_input = self.latent_out_proj(latent_sample)

        if self.backbones is None:
            raise NotImplementedError

        # Image observation features and position embeddings.
        # All cameras are pushed through backbones[0] as one (bs * num_cam) batch.
        all_cam_features = []
        all_cam_pos = []
        featuress, poss = self.backbones[0](image.flatten(0, 1))
        last_features = featuress[0]  # first entry returned by the backbone
        # Undo the (bs, num_cam) flattening. The trailing dims come from the
        # backbone output itself instead of a hard-coded (2, 384, 16, 22), so any
        # camera count / backbone resolution works.
        per_cam_features = last_features.view(image.shape[0], image.shape[1], *last_features.shape[1:])
        pos = poss[0]
        for cam_id, _cam_name in enumerate(self.camera_names):
            features = per_cam_features[:, cam_id]
            all_cam_features.append(self.input_proj(features))
            # Same base encoding for every camera, halved and shifted by (cam_id - 0.5).
            all_cam_pos.append(pos/2 + cam_id - 0.5)
        # proprioception features
        proprio_input = self.input_proj_robot_state(qpos)
        # fold camera dimension into width dimension
        src = torch.cat(all_cam_features, axis=3)
        pos = torch.cat(all_cam_pos, axis=3)
        hs = self.transformer(src, None, self.query_embed.weight, pos, latent_input, proprio_input, self.additional_pos_embed.weight)[0]

        a_hat = self.action_head(hs)
        is_pad_hat = self.is_pad_head(hs)
        return a_hat, is_pad_hat, [mu, logvar]
class CNNMLP(nn.Module):
    """Baseline policy: per-camera CNN features + robot state fed to an MLP."""

    def __init__(self, backbones, state_dim, camera_names):
        """ Initializes the model.
        Parameters:
            backbones: list with one backbone module per camera. See backbone.py
            state_dim: output width of ``action_head`` (a layer that ``forward`` does not use).
            camera_names: one entry per camera; indexes into ``backbones``.
        Raises:
            NotImplementedError: if ``backbones`` is None.
        """
        super().__init__()
        self.camera_names = camera_names
        self.action_head = nn.Linear(1000, state_dim)  # TODO add more
        if backbones is None:
            raise NotImplementedError
        self.backbones = nn.ModuleList(backbones)
        down_projs = []
        for backbone in backbones:
            # Three 5x5 convolutions shrinking the channel count to 32.
            down_projs.append(nn.Sequential(
                nn.Conv2d(backbone.num_channels, 128, kernel_size=5),
                nn.Conv2d(128, 64, kernel_size=5),
                nn.Conv2d(64, 32, kernel_size=5)
            ))
        self.backbone_down_projs = nn.ModuleList(down_projs)
        # 768 flattened features per camera plus a 14-dim qpos (both hard-coded).
        mlp_in_dim = 768 * len(backbones) + 14
        self.mlp = mlp(input_dim=mlp_in_dim, hidden_dim=1024, output_dim=14, hidden_depth=2)

    def forward(self, qpos, image, env_state, actions=None):
        """
        qpos: batch, qpos_dim
        image: batch, num_cam, channel, height, width
        env_state: None (unused)
        actions: batch, seq, action_dim (unused by the computation)
        """
        bs, _ = qpos.shape
        # Image observation features; the position encodings are discarded.
        per_cam = []
        for cam_id, _cam_name in enumerate(self.camera_names):
            features, _pos = self.backbones[cam_id](image[:, cam_id])
            last_features = features[0]  # take the last layer feature
            per_cam.append(self.backbone_down_projs[cam_id](last_features))
        # flatten everything
        flat = [cam_feature.reshape([bs, -1]) for cam_feature in per_cam]
        flat = torch.cat(flat, axis=1)  # 768 each
        mlp_input = torch.cat([flat, qpos], axis=1)  # qpos: 14
        return self.mlp(mlp_input)
def mlp(input_dim, hidden_dim, output_dim, hidden_depth):
    """Build a ReLU multilayer perceptron as an ``nn.Sequential``.

    With ``hidden_depth == 0`` the result is a single linear layer; otherwise
    there are ``hidden_depth`` hidden Linear+ReLU pairs followed by the output
    linear layer.
    """
    if hidden_depth == 0:
        return nn.Sequential(nn.Linear(input_dim, output_dim))

    layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)]
    for _ in range(hidden_depth - 1):
        layers.extend((nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)))
    layers.append(nn.Linear(hidden_dim, output_dim))
    return nn.Sequential(*layers)
def build_encoder(args):
    """Build the transformer encoder that infers the CVAE latent.

    Reads ``hidden_dim``, ``nheads``, ``dim_feedforward``, ``dropout``,
    ``enc_layers`` and ``pre_norm`` from ``args``; the activation is fixed to
    ReLU. A final LayerNorm is attached only in pre-norm mode.
    """
    d_model = args.hidden_dim  # 256
    dropout = args.dropout  # 0.1
    nhead = args.nheads  # 8
    dim_feedforward = args.dim_feedforward  # 2048
    num_encoder_layers = args.enc_layers  # 4 # TODO shared with VAE decoder
    normalize_before = args.pre_norm  # False

    layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                    dropout, "relu", normalize_before)
    if normalize_before:
        final_norm = nn.LayerNorm(d_model)
    else:
        final_norm = None
    return TransformerEncoder(layer, num_encoder_layers, final_norm)
def build(args):
    """Construct a ``DETRVAE`` from ``args`` and print its trainable parameter count.

    A single backbone is built and shared by every camera. ``args`` must
    provide ``state_dim``, ``action_dim``, ``num_queries`` and ``camera_names``
    in addition to whatever the backbone / transformer / encoder builders read.
    """
    state_dim = args.state_dim
    action_dim = args.action_dim

    # From image: one backbone only.
    backbones = [build_backbone(args)]
    transformer = build_transformer(args)
    encoder = build_encoder(args)

    model = DETRVAE(
        backbones,
        transformer,
        encoder,
        state_dim=state_dim,
        action_dim=action_dim,
        num_queries=args.num_queries,
        camera_names=args.camera_names,
    )

    trainable = (p.numel() for p in model.parameters() if p.requires_grad)
    n_parameters = sum(trainable)
    print("number of parameters: %.2fM" % (n_parameters/1e6,))
    return model
def build_cnnmlp(args):
    """Construct a ``CNNMLP`` from ``args`` and print its trainable parameter count.

    One backbone is built per entry of ``args.camera_names``; the state
    dimension is hard-coded to 14.
    """
    state_dim = 14  # TODO hardcode

    # From image: a separate backbone for each camera.
    backbones = [build_backbone(args) for _ in args.camera_names]

    model = CNNMLP(
        backbones,
        state_dim=state_dim,
        camera_names=args.camera_names,
    )

    trainable = (p.numel() for p in model.parameters() if p.requires_grad)
    n_parameters = sum(trainable)
    print("number of parameters: %.2fM" % (n_parameters/1e6,))
    return model

93
act/detr/models/position_encoding.py

@ -0,0 +1,93 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn
from util.misc import NestedTensor
import IPython
e = IPython.embed
class PositionEmbeddingSine(nn.Module):
    """
    Sinusoidal 2-D position embedding (the "Attention is all you need" scheme
    extended to image grids). No mask is used: every location counts as valid.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        """
        Parameters:
            num_pos_feats: channels per axis; the output has twice as many.
            temperature: base of the frequency progression.
            normalize: rescale coordinates to ``[0, scale]`` before encoding.
            scale: only valid with ``normalize=True``; defaults to ``2 * pi``.
        Raises:
            ValueError: if ``scale`` is given while ``normalize`` is False.
        """
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, tensor):
        """Return the encoding for ``tensor``'s spatial grid, shape ``(1, 2 * num_pos_feats, H, W)``.

        Only the shape, dtype and device of ``tensor`` matter; its values are ignored.
        """
        # (1, H, W) grid of ones taken from the first sample's first channel.
        ones = torch.ones_like(tensor[0, [0]])
        y_embed = ones.cumsum(1, dtype=torch.float32)
        x_embed = ones.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        freq_index = torch.arange(self.num_pos_feats, dtype=torch.float32, device=tensor.device)
        dim_t = self.temperature ** (2 * (freq_index // 2) / self.num_pos_feats)

        def encode(coords):
            # Interleave sin (even slots) and cos (odd slots) along the feature axis.
            angles = coords[:, :, :, None] / dim_t
            return torch.stack((angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4).flatten(3)

        pos_x = encode(x_embed)
        pos_y = encode(y_embed)
        return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.

    Row and column tables hold 50 entries each, so feature maps may be at most
    50 cells high and 50 cells wide.
    """

    def __init__(self, num_pos_feats=256):
        """Create the row/column tables with ``num_pos_feats`` channels each."""
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise both tables from U(0, 1)."""
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list):
        """Return the embedding for the input's spatial grid.

        Parameters:
            tensor_list: either a plain tensor (what ``Joiner`` passes) or an
                object exposing the tensor as ``.tensors``.

        Returns:
            Tensor of shape ``(batch, 2 * num_pos_feats, H, W)``: column
            embeddings in the first half of the channels, row embeddings in
            the second half.
        """
        # Joiner hands over raw feature maps; unwrap only when a wrapper is given.
        x = getattr(tensor_list, "tensors", tensor_list)
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
        return pos
def build_position_encoding(args):
    """Create the position-embedding module selected by ``args.position_embedding``.

    'v2' / 'sine' give a normalised ``PositionEmbeddingSine``; 'v3' / 'learned'
    give a ``PositionEmbeddingLearned``. Each uses ``args.hidden_dim // 2``
    features per axis.

    Raises:
        ValueError: for any other value of ``args.position_embedding``.
    """
    N_steps = args.hidden_dim // 2
    kind = args.position_embedding
    if kind in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        return PositionEmbeddingSine(N_steps, normalize=True)
    if kind in ('v3', 'learned'):
        return PositionEmbeddingLearned(N_steps)
    raise ValueError(f"not supported {args.position_embedding}")

314
act/detr/models/transformer.py

@ -0,0 +1,314 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DETR Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
* positional encodings are passed in MHattention
* extra LN at the end of encoder is removed
* decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import Optional, List
import torch
import torch.nn.functional as F
from torch import nn, Tensor
import IPython
e = IPython.embed
class Transformer(nn.Module):
    """Encoder-decoder transformer in which position encodings are passed to every layer."""

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        """Build the encoder and decoder stacks and Xavier-initialise their weights.

        The encoder gets a final LayerNorm only when ``normalize_before`` is
        true; the decoder always gets one.
        """
        super().__init__()

        enc_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                            dropout, activation, normalize_before)
        enc_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(enc_layer, num_encoder_layers, enc_norm)

        dec_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                            dropout, activation, normalize_before)
        dec_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(dec_layer, num_decoder_layers, dec_norm,
                                          return_intermediate=return_intermediate_dec)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def forward(self, src, mask, query_embed, pos_embed, latent_input=None, proprio_input=None, additional_pos_embed=None):
        """Encode ``src`` and decode one output per query.

        A 4-D ``src`` (N, C, H, W) is flattened to a token sequence and prefixed
        with two extra tokens, ``latent_input`` and ``proprio_input``, whose
        position encodings come from ``additional_pos_embed``. A 3-D ``src``
        (N, HW, C) is used as a token sequence directly and the three optional
        arguments are ignored.

        Returns the decoder output with its second and third axes swapped.
        """
        # TODO flatten only when input has H and W
        if src.dim() == 4:  # has H and W
            # flatten NxCxHxW to HWxNxC
            bs = src.shape[0]
            src = src.flatten(2).permute(2, 0, 1)
            pos_embed = pos_embed.flatten(2).permute(2, 0, 1).repeat(1, bs, 1)
            query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)

            extra_pos = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1)  # seq, bs, dim
            pos_embed = torch.cat([extra_pos, pos_embed], dim=0)

            extra_tokens = torch.stack([latent_input, proprio_input], dim=0)
            src = torch.cat([extra_tokens, src], dim=0)
        else:
            assert len(src.shape) == 3
            # flatten NxHWxC to HWxNxC
            bs = src.shape[0]
            src = src.permute(1, 0, 2)
            pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1)
            query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)

        # Decoder input starts at zero; the queries enter only as position encodings.
        tgt = torch.zeros_like(query_embed)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        decoded = self.decoder(tgt, memory, memory_key_padding_mask=mask,
                               pos=pos_embed, query_pos=query_embed)
        return decoded.transpose(1, 2)
class TransformerEncoder(nn.Module):
    """Stack of ``num_layers`` deep copies of ``encoder_layer`` with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        """Pass ``src`` through every layer in turn, then through ``norm`` if one was given."""
        hidden = src
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask=mask,
                                   src_key_padding_mask=src_key_padding_mask, pos=pos)
        if self.norm is None:
            return hidden
        return self.norm(hidden)
class TransformerDecoder(nn.Module):
    """Stack of ``num_layers`` deep copies of ``decoder_layer``.

    With ``return_intermediate`` the normalised output of every layer is
    returned (stacked along a new first axis); otherwise only the final output
    is returned, with a leading axis of size one.
    """

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Decode ``tgt`` against ``memory`` through every layer."""
        hidden = tgt
        per_layer = []

        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, memory, tgt_mask=tgt_mask,
                                   memory_mask=memory_mask,
                                   tgt_key_padding_mask=tgt_key_padding_mask,
                                   memory_key_padding_mask=memory_key_padding_mask,
                                   pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                per_layer.append(self.norm(hidden))

        if self.norm is not None:
            hidden = self.norm(hidden)
            if self.return_intermediate:
                # Swap the last collected entry for the freshly normalised output.
                per_layer.pop()
                per_layer.append(hidden)

        if not self.return_intermediate:
            return hidden.unsqueeze(0)
        return torch.stack(per_layer)
class TransformerEncoderLayer(nn.Module):
    """Self-attention + feed-forward block, in post-norm or pre-norm arrangement."""

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor``, or return ``tensor`` untouched when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self,
                     src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        """Post-norm variant: LayerNorm is applied after each residual addition."""
        # Position encodings go into queries and keys only; values stay raw.
        q = k = self.with_pos_embed(src, pos)
        attended = self.self_attn(q, k, value=src, attn_mask=src_mask,
                                  key_padding_mask=src_key_padding_mask)[0]
        src = self.norm1(src + self.dropout1(attended))
        feed_forward = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = self.norm2(src + self.dropout2(feed_forward))
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        """Pre-norm variant: LayerNorm is applied before each sub-layer."""
        normed = self.norm1(src)
        q = k = self.with_pos_embed(normed, pos)
        attended = self.self_attn(q, k, value=normed, attn_mask=src_mask,
                                  key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(attended)
        normed = self.norm2(src)
        feed_forward = self.linear2(self.dropout(self.activation(self.linear1(normed))))
        src = src + self.dropout2(feed_forward)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        """Dispatch to the pre-norm or post-norm implementation."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(src, src_mask, src_key_padding_mask, pos)
class TransformerDecoderLayer(nn.Module):
    """Self-attention, cross-attention and feed-forward block (post-norm or pre-norm)."""

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor``, or return ``tensor`` untouched when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Post-norm variant: LayerNorm is applied after each residual addition."""
        # Self-attention over the queries (positions added to q/k only).
        q = k = self.with_pos_embed(tgt, query_pos)
        self_attended = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                                       key_padding_mask=tgt_key_padding_mask)[0]
        tgt = self.norm1(tgt + self.dropout1(self_attended))
        # Cross-attention from the queries into the encoder memory.
        cross_attended = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                             key=self.with_pos_embed(memory, pos),
                                             value=memory, attn_mask=memory_mask,
                                             key_padding_mask=memory_key_padding_mask)[0]
        tgt = self.norm2(tgt + self.dropout2(cross_attended))
        feed_forward = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = self.norm3(tgt + self.dropout3(feed_forward))
        return tgt

    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """Pre-norm variant: LayerNorm is applied before each sub-layer."""
        normed = self.norm1(tgt)
        q = k = self.with_pos_embed(normed, query_pos)
        self_attended = self.self_attn(q, k, value=normed, attn_mask=tgt_mask,
                                       key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(self_attended)
        normed = self.norm2(tgt)
        cross_attended = self.multihead_attn(query=self.with_pos_embed(normed, query_pos),
                                             key=self.with_pos_embed(memory, pos),
                                             value=memory, attn_mask=memory_mask,
                                             key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(cross_attended)
        normed = self.norm3(tgt)
        feed_forward = self.linear2(self.dropout(self.activation(self.linear1(normed))))
        tgt = tgt + self.dropout3(feed_forward)
        return tgt

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Dispatch to the pre-norm or post-norm implementation."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(tgt, memory, tgt_mask, memory_mask,
                   tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def build_transformer(args):
    """Create the policy ``Transformer`` from ``args``.

    Reads ``hidden_dim``, ``dropout``, ``nheads``, ``dim_feedforward``,
    ``enc_layers``, ``dec_layers`` and ``pre_norm``; intermediate decoder
    outputs are always returned.
    """
    settings = dict(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
    )
    return Transformer(**settings)
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")

10
act/detr/setup.py

@ -0,0 +1,10 @@
# distutils is deprecated (PEP 632) and removed in Python 3.12; setuptools
# provides the same setup() entry point.
from setuptools import find_packages
from setuptools import setup

# Read the long description with a context manager so the file handle is closed.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='detr',
    version='0.0.0',
    packages=find_packages(),
    license='MIT License',
    long_description=long_description,
)

1
act/detr/util/__init__.py

@ -0,0 +1 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

88
act/detr/util/box_ops.py

@ -0,0 +1,88 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Utilities for bounding box manipulation and GIoU.
"""
import torch
from torchvision.ops.boxes import box_area
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to (x0, y0, x1, y1).

    The last axis of ``x`` must have size 4; all other axes are preserved.
    """
    x_c, y_c, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [x_c - half_w, y_c - half_h, x_c + half_w, y_c + half_h]
    return torch.stack(corners, dim=-1)
def box_xyxy_to_cxcywh(x):
    """Convert boxes from (x0, y0, x1, y1) to (center_x, center_y, width, height).

    The last axis of ``x`` must have size 4; all other axes are preserved.
    """
    x0, y0, x1, y1 = x.unbind(-1)
    center_x = (x0 + x1) / 2
    center_y = (y0 + y1) / 2
    width = x1 - x0
    height = y1 - y0
    return torch.stack([center_x, center_y, width, height], dim=-1)
# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of (x0, y0, x1, y1) boxes.

    Returns ``(iou, union)``, both of shape [N, M] for N boxes in ``boxes1``
    and M boxes in ``boxes2``.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Corners of the pairwise intersection rectangles.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Negative extents mean "no overlap" and are clamped to zero.
    extent = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    inter = extent[:, :, 0] * extent[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter
    return inter / union, union
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    Both inputs hold boxes in [x0, y0, x1, y1] format.

    The result is a [N, M] pairwise matrix with N = len(boxes1) and
    M = len(boxes2).
    """
    # degenerate boxes give inf / nan results, so reject them up front
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = box_iou(boxes1, boxes2)

    # Smallest rectangle enclosing each pair of boxes.
    enclose_top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    extent = (enclose_bottom_right - enclose_top_left).clamp(min=0)  # [N,M,2]
    enclose_area = extent[:, :, 0] * extent[:, :, 1]

    return iou - (enclose_area - union) / enclose_area
def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks

    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.

    Returns a [N, 4] tensors, with the boxes in xyxy format
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    # Coordinate grids live on the masks' device so CUDA masks work, and plain
    # broadcasting replaces torch.meshgrid (which warns without `indexing`).
    y = torch.arange(0, h, dtype=torch.float, device=masks.device).view(1, h, 1)
    x = torch.arange(0, w, dtype=torch.float, device=masks.device).view(1, 1, w)
    outside = ~(masks.bool())

    x_mask = masks * x
    x_max = x_mask.flatten(1).max(-1)[0]
    # Cells outside the mask get a huge value so they never win the minimum.
    x_min = x_mask.masked_fill(outside, 1e8).flatten(1).min(-1)[0]

    y_mask = masks * y
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(outside, 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)

468
act/detr/util/misc.py

@ -0,0 +1,468 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Misc functions, including distributed helpers.
Mostly copy-paste from torchvision references.
"""
import os
import subprocess
import time
from collections import defaultdict, deque
import datetime
import pickle
from packaging import version
from typing import Optional, List
import torch
import torch.distributed as dist
from torch import Tensor
# needed due to empty tensor bug in pytorch and torchvision 0.5
import torchvision
if version.parse(torchvision.__version__) < version.parse('0.7'):
from torchvision.ops import _new_empty_tensor
from torchvision.ops.misc import _output_size
class SmoothedValue(object):
    """Keep a sliding window of recent values plus running totals.

    Window statistics (median, avg, max, value) cover the last ``window_size``
    updates; ``global_avg`` covers everything seen so far.
    """

    def __init__(self, window_size=20, fmt=None):
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt

    def update(self, value, n=1):
        """Record ``value`` once in the window and with weight ``n`` in the totals."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` across processes.

        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        packed = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(packed)
        count, total = packed.tolist()
        self.count = int(count)
        self.total = total

    @property
    def median(self):
        """Median of the values currently in the window."""
        window = torch.tensor(list(self.deque))
        return window.median().item()

    @property
    def avg(self):
        """Mean of the values currently in the window."""
        window = torch.tensor(list(self.deque), dtype=torch.float32)
        return window.mean().item()

    @property
    def global_avg(self):
        """Weighted mean over every update so far."""
        return self.total / self.count

    @property
    def max(self):
        """Largest value currently in the window."""
        return max(self.deque)

    @property
    def value(self):
        """Most recently recorded value."""
        return self.deque[-1]

    def __str__(self):
        stats = dict(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value,
        )
        return self.fmt.format(**stats)
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors).

    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # Pickle the payload and move the raw bytes onto the GPU.
    payload = pickle.dumps(data)
    byte_storage = torch.ByteStorage.from_buffer(payload)
    tensor = torch.ByteTensor(byte_storage).to("cuda")

    # Exchange payload sizes so every rank knows how much to keep later.
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # torch all_gather needs identically shaped tensors, so every payload is
    # padded up to the largest one before the exchange.
    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device="cuda")
        for _ in size_list
    ]
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    # Drop the padding and unpickle each rank's bytes.
    data_list = []
    for size, received in zip(size_list, tensor_list):
        raw = received.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(raw))
    return data_list
def reduce_dict(input_dict, average=True):
    """
    Reduce the values of a dictionary across all processes.

    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Returns:
        A dict with the same keys as ``input_dict`` holding the reduced
        values. When fewer than two processes are running, ``input_dict``
        itself is returned untouched.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        # Sorted keys give every process the same stacking order.
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[name] for name in names], dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = dict(zip(names, values))
    return reduced_dict
class MetricLogger(object):
    """Collection of named ``SmoothedValue`` meters with periodic printing.

    Meters are created on first use and are reachable as attributes
    (``logger.loss``) through ``__getattr__``.
    """

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Record one value per keyword; tensors are converted with ``.item()``."""
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # Only called when normal attribute lookup fails. ``meters`` is read
        # through ``__dict__`` so that an instance whose state has not been
        # populated yet (copy / pickle create the object before restoring its
        # attributes) raises AttributeError instead of recursing forever.
        meters = self.__dict__.get('meters')
        if meters is not None and attr in meters:
            return meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = [
            "{}: {}".format(name, str(meter))
            for name, meter in self.meters.items()
        ]
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        """Synchronize every meter's totals across processes."""
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        """Register ``meter`` under ``name``, replacing any existing one."""
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield items of ``iterable``, printing progress every ``print_freq`` items.

        ``iterable`` must support ``len()``. A line is also printed for the
        last item, and a total-time summary once the iterable is exhausted.
        """
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        # Pad the running index to the width of the total count.
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        use_cuda = torch.cuda.is_available()
        parts = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}',
        ]
        if use_cuda:
            parts.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(parts)
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                fields = dict(
                    eta=str(datetime.timedelta(seconds=int(eta_seconds))),
                    meters=str(self),
                    time=str(iter_time),
                    data=str(data_time),
                )
                if use_cuda:
                    fields['memory'] = torch.cuda.max_memory_allocated() / MB
                print(log_msg.format(i, len(iterable), **fields))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # Guard the per-iteration figure against an empty iterable.
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / max(len(iterable), 1)))
def get_sha():
    """Describe the git state of the directory that contains this file.

    Returns a string of the form ``"sha: ..., status: ..., branch: ..."``.
    Any failure while running git is swallowed, leaving the fields that were
    not yet filled at their defaults (``N/A`` / ``clean``).
    """
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()

    sha, diff, branch = 'N/A', "clean", 'N/A'
    try:
        sha = _run(['git', 'rev-parse', 'HEAD'])
        # Output is discarded; the call is only made for its effect on git.
        subprocess.check_output(['git', 'diff'], cwd=cwd)
        changed = _run(['git', 'diff-index', 'HEAD'])
        diff = "has uncommited changes" if changed else "clean"
        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
    except Exception:
        # Best effort: report whatever was gathered before the failure.
        pass
    return f"sha: {sha}, status: {diff}, branch: {branch}"
def collate_fn(batch):
    """Transpose a batch of samples and pad its first column.

    The samples are regrouped field by field; the first field is turned into
    a ``NestedTensor`` via ``nested_tensor_from_tensor_list`` and the result
    is returned as a tuple.
    """
    columns = list(zip(*batch))
    columns[0] = nested_tensor_from_tensor_list(columns[0])
    return tuple(columns)
def _max_by_axis(the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
    """A tensor bundled with an optional mask tensor."""

    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor # noqa
        """Return a new ``NestedTensor`` with tensors (and mask, if any) on ``device``."""
        moved_tensors = self.tensors.to(device)
        moved_mask = None if self.mask is None else self.mask.to(device)
        return NestedTensor(moved_tensors, moved_mask)

    def decompose(self):
        """Return the ``(tensors, mask)`` pair."""
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    """Zero-pad a list of 3-D tensors to a common shape and batch them.

    Returns a ``NestedTensor`` whose mask is ``False`` over each image's
    original extent and ``True`` over the padding. Raises ``ValueError`` when
    the first tensor is not 3-dimensional.
    """
    # TODO make this more general
    if tensor_list[0].ndim != 3:
        raise ValueError('not supported')

    if torchvision._is_tracing():
        # nested_tensor_from_tensor_list() does not export well to ONNX
        # call _onnx_nested_tensor_from_tensor_list() instead
        return _onnx_nested_tensor_from_tensor_list(tensor_list)

    # TODO make it support different-sized images
    max_size = _max_by_axis([list(img.shape) for img in tensor_list])
    batch_shape = [len(tensor_list)] + max_size
    b, c, h, w = batch_shape
    first = tensor_list[0]
    tensor = torch.zeros(batch_shape, dtype=first.dtype, device=first.device)
    mask = torch.ones((b, h, w), dtype=torch.bool, device=first.device)
    for img, pad_img, m in zip(tensor_list, tensor, mask):
        channels, rows, cols = img.shape
        pad_img[:channels, :rows, :cols].copy_(img)
        m[:rows, :cols] = False
    return NestedTensor(tensor, mask)
# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    """Variant of ``nested_tensor_from_tensor_list`` built from ``pad``/``stack`` only.

    Sliced in-place copies and sliced mask assignment are avoided here because
    they are not supported by ONNX tracing.
    """
    # Largest extent along each dimension across all images.
    max_size = tuple(
        [
            torch.max(
                torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
            ).to(torch.int64)
            for i in range(tensor_list[0].dim())
        ]
    )

    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        # F.pad takes (before, after) pairs starting from the last dimension.
        padded_imgs.append(
            torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        )

        # Zeros over the real image, ones over the padded border.
        valid = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        border_mask = torch.nn.functional.pad(valid, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(border_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)
    return NestedTensor(tensor, mask=mask)
def setup_for_distributed(is_master):
    """
    Replace the builtin ``print`` so that it is silent unless ``is_master``
    is true or the call passes ``force=True``.
    """
    import builtins as __builtin__
    wrapped_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if not (is_master or force):
            return
        wrapped_print(*args, **kwargs)

    __builtin__.print = print
def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is both available and initialized."""
    return dist.is_available() and dist.is_initialized()
def get_world_size():
    """Return the distributed world size, or 1 when not running distributed."""
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1
def get_rank():
    """Return this process's distributed rank, or 0 when not running distributed."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0
def is_main_process():
    """Return True when the current process has rank 0."""
    rank = get_rank()
    return rank == 0
def save_on_master(*args, **kwargs):
    """Forward the arguments to ``torch.save`` on the main process; do nothing elsewhere."""
    if not is_main_process():
        return
    torch.save(*args, **kwargs)
def init_distributed_mode(args):
    """Configure ``args`` and torch.distributed from environment variables.

    ``RANK`` + ``WORLD_SIZE`` (with ``LOCAL_RANK``) are used when both are
    set; otherwise ``SLURM_PROCID`` is consulted. With neither present,
    ``args.distributed`` is set to ``False`` and nothing is initialized.
    In the distributed case the function selects the GPU, initializes the
    NCCL process group from ``args.dist_url``, waits on a barrier and silences
    ``print`` on non-zero ranks.
    """
    env = os.environ
    if 'RANK' in env and 'WORLD_SIZE' in env:
        args.rank = int(env["RANK"])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = int(env['LOCAL_RANK'])
    elif 'SLURM_PROCID' in env:
        # NOTE(review): this branch does not set args.world_size; it is read
        # below, so presumably the caller provides it. Confirm.
        args.rank = int(env['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True
    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: score tensor; the top-k is taken along dimension 1.
        target: tensor of class indices, one per row of ``output``.
        topk: iterable of k values to evaluate.
    Returns:
        A list with one scalar tensor per entry of ``topk``, each holding the
        percentage of rows whose target is among the k highest scores. For an
        empty ``target`` every entry is zero.
    """
    if target.numel() == 0:
        # One zero per requested k so callers can always unpack the result.
        return [torch.zeros([], device=output.device) for _ in topk]
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # ``correct`` derives from a transposed tensor and may be
        # non-contiguous; ``view(-1)`` rejects that layout for k > 1, while
        # ``reshape`` copies when needed.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
    """
    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.

    With torchvision older than 0.7, an empty input is handled here by
    computing the output shape and returning an empty tensor of that shape.
    With newer torchvision the call is delegated to
    ``torchvision.ops.misc.interpolate`` when that function exists, and to
    ``torch.nn.functional.interpolate`` otherwise.
    """
    if version.parse(torchvision.__version__) < version.parse('0.7'):
        if input.numel() > 0:
            return torch.nn.functional.interpolate(
                input, size, scale_factor, mode, align_corners
            )
        output_shape = _output_size(2, input, size, scale_factor)
        output_shape = list(input.shape[:-2]) + list(output_shape)
        return _new_empty_tensor(input, output_shape)

    # The torchvision helper is not present in every release; calling it
    # unconditionally raises AttributeError where it is missing.
    tv_interpolate = getattr(torchvision.ops.misc, "interpolate", None)
    if tv_interpolate is None:
        tv_interpolate = torch.nn.functional.interpolate
    return tv_interpolate(input, size, scale_factor, mode, align_corners)

107
act/detr/util/plot_utils.py

@ -0,0 +1,107 @@
"""
Plotting utilities to visualize training logs.
"""
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path, PurePath
def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
    '''
    Function to plot specific fields from training log(s). Plots both training and test results.

    :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
              - fields = which results to plot from each log file - plots both training and test for each field.
              - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
              - log_name = optional, name of log file if different than default 'log.txt'.

    :: Outputs - matplotlib plots of results in fields, color coded for each log file.
               - solid lines are training results, dashed lines are test results.

    Raises ValueError for an argument that is neither a Path nor a list of
    existing Path directories. Returns None (early, without plotting, when a
    log file is missing).
    '''
    func_name = "plot_utils.py::plot_logs"

    # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
    # convert single Path to list to avoid 'not iterable' error
    if not isinstance(logs, list):
        if isinstance(logs, PurePath):
            logs = [logs]
            print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
        else:
            raise ValueError(
                f"{func_name} - invalid argument for logs parameter.\n "
                f"Expect list[Path] or single Path obj, received {type(logs)}"
            )

    # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir
    for log_dir in logs:
        if not isinstance(log_dir, PurePath):
            raise ValueError(f"{func_name} - non-Path object in logs argument of {type(log_dir)}: \n{log_dir}")
        if not log_dir.exists():
            raise ValueError(f"{func_name} - invalid directory in logs argument:\n{log_dir}")
        # verify log_name exists
        fn = Path(log_dir / log_name)
        if not fn.exists():
            print(f"-> missing {log_name}. Have you gotten to Epoch 1 in training?")
            print(f"--> full path of missing log file: {fn}")
            return

    # load log file(s) and plot
    dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]

    # squeeze=False keeps the result an array even for a single field, so the
    # axs[j] indexing below works for any number of fields.
    fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5), squeeze=False)
    axs = axs.ravel()

    for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
        for j, field in enumerate(fields):
            if field == 'mAP':
                coco_eval = pd.DataFrame(
                    np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1]
                ).ewm(com=ewm_col).mean()
                axs[j].plot(coco_eval, c=color)
            else:
                df.interpolate().ewm(com=ewm_col).mean().plot(
                    y=[f'train_{field}', f'test_{field}'],
                    ax=axs[j],
                    color=[color] * 2,
                    style=['-', '--']
                )
    for ax, field in zip(axs, fields):
        ax.legend([Path(p).name for p in logs])
        ax.set_title(field)
def plot_precision_recall(files, naming_scheme='iter'):
    """Plot precision/recall and score/recall curves for saved evaluation files.

    Each file is read with ``torch.load``; a one-line summary is printed per
    file. Curve labels come from the third-from-last path component
    (``'exp_id'``) or the file stem (``'iter'``); any other ``naming_scheme``
    raises ``ValueError``. Returns ``(fig, axs)``.
    """
    if naming_scheme == 'exp_id':
        # name becomes exp_id
        names = [f.parts[-3] for f in files]
    elif naming_scheme == 'iter':
        names = [f.stem for f in files]
    else:
        raise ValueError(f'not supported {naming_scheme}')

    fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
    palette = sns.color_palette("Blues", n_colors=len(files))
    for f, color, name in zip(files, palette, names):
        data = torch.load(f)
        recall = data['params'].recThrs
        # precision is n_iou, n_points, n_cat, n_area, max_det
        # take precision for all classes, all areas and 100 detections
        precision = data['precision'][0, :, :, 0, -1].mean(1)
        scores = data['scores'][0, :, :, 0, -1].mean(1)
        prec = precision.mean()
        rec = data['recall'][0, :, 0, -1].mean()
        print(
            f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, '
            f'score={scores.mean():0.3f}, '
            f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
        )
        axs[0].plot(recall, precision, c=color)
        axs[1].plot(recall, scores, c=color)

    for ax, title in zip((axs[0], axs[1]), ('Precision / Recall', 'Scores / Recall')):
        ax.set_title(title)
        ax.legend(names)
    return fig, axs

367
act/imitate_episodes.py

@ -0,0 +1,367 @@
import torch
import numpy as np
import os
import pickle
import argparse
import matplotlib.pyplot as plt
from copy import deepcopy
from tqdm import tqdm
from einops import rearrange
# from .constants import DT
# from .constants import PUPPET_GRIPPER_JOINT_OPEN
from utils import load_data # data functions
from utils import compute_dict_mean, set_seed, detach_dict, parse_id, find_all_ckpt # helper functions
from policy import ACTPolicy, CNNMLPPolicy
# from .visualize_episodes import save_videos
import wandb
# from sim_env import BOX_POSE
# from constants import SIM_TASK_CONFIGS
import IPython
e = IPython.embed
import time
from itertools import repeat
def repeater(data_loader):
    """Yield items from ``data_loader`` forever, restarting it after each pass.

    Prints ``Epoch <n> done`` every time a full pass over the loader finishes.
    """
    epoch = 0
    while True:
        for batch in data_loader:
            yield batch
        print(f'Epoch {epoch} done')
        epoch += 1
from pathlib import Path
def main(args):
    """Entry point: build the configuration, then export or train a policy.

    ``args`` is the dict of parsed command-line options. The function resolves
    dataset and checkpoint directories from the module-level ``RECORD_DIR``
    and ``LOG_DIR``, starts a wandb run, loads the data, writes the dataset
    statistics to the checkpoint directory, and then either exports a traced
    policy (``--save_jit``) or trains and saves the best checkpoint.
    """
    set_seed(1)

    # command line parameters
    is_eval = args['eval']  # read but not used below
    policy_class = args['policy_class']
    onscreen_render = args['onscreen_render']
    batch_size_train = args['batch_size']
    batch_size_val = args['batch_size']
    num_epochs = args['num_epochs']

    # locations derived from the task / experiment ids
    task_dir, task_name = parse_id(RECORD_DIR, args['taskid'])
    dataset_dir = (Path(task_dir) / 'processed').resolve()
    ckpt_dir = (LOG_DIR / task_name / args['exptid']).resolve()
    banner = "*"*20
    print(banner)
    print(f"Task name: {task_name}")
    print(banner)

    camera_names = ['left', 'right']

    # fixed parameters
    state_dim = 26
    action_dim = 28
    lr_backbone = 1e-5
    backbone = 'dino_v2'
    if policy_class == 'ACT':
        policy_config = {
            'lr': args['lr'],
            'num_queries': args['chunk_size'],
            'kl_weight': args['kl_weight'],
            'hidden_dim': args['hidden_dim'],
            'dim_feedforward': args['dim_feedforward'],
            'lr_backbone': lr_backbone,
            'backbone': backbone,
            'enc_layers': 4,
            'dec_layers': 7,
            'nheads': 8,
            'camera_names': camera_names,
            'state_dim': state_dim,
            'action_dim': action_dim,
            'qpos_noise_std': args['qpos_noise_std'],
        }
    elif policy_class == 'CNNMLP':
        policy_config = {
            'lr': args['lr'],
            'lr_backbone': lr_backbone,
            'backbone': backbone,
            'num_queries': 1,
            'camera_names': camera_names,
        }
    else:
        raise NotImplementedError

    config = {
        'num_epochs': num_epochs,
        'ckpt_dir': ckpt_dir,
        'state_dim': state_dim,
        'action_dim': action_dim,
        'lr': args['lr'],
        'policy_class': policy_class,
        'onscreen_render': onscreen_render,
        'policy_config': policy_config,
        'seed': args['seed'],
        'temporal_agg': args['temporal_agg'],
        'camera_names': camera_names,
        'resumeid': args['resumeid'],
        'resume_ckpt': args['resume_ckpt'],
        'task_name': task_name,
        'exptid': args['exptid'],
    }

    # wandb is disabled when requested explicitly or when only exporting.
    if args["no_wandb"] or args["save_jit"]:
        mode = "disabled"
    else:
        mode = "online"
    wandb.init(project="television", name=args['exptid'], group=task_name, entity="cxx", mode=mode, dir="../data/logs")
    wandb.config.update(config)

    train_dataloader, val_dataloader, stats, _ = load_data(dataset_dir, camera_names, batch_size_train, batch_size_val)

    # save dataset stats
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)
    stats_path = os.path.join(ckpt_dir, 'dataset_stats.pkl')
    with open(stats_path, 'wb') as stats_file:
        pickle.dump(stats, stats_file)

    if args['save_jit']:
        save_jit(config)
        return

    best_epoch, min_val_loss, best_state_dict = train_bc(train_dataloader, val_dataloader, config)

    # save best checkpoint
    ckpt_path = os.path.join(ckpt_dir, 'policy_best.ckpt')
    torch.save(best_state_dict, ckpt_path)
    print(f'Best ckpt, val loss {min_val_loss:.6f} @ epoch{best_epoch}')
    wandb.finish()
def make_policy(policy_class, policy_config):
    """Instantiate the policy named by ``policy_class`` ('ACT' or 'CNNMLP').

    Raises ``NotImplementedError`` for any other name.
    """
    if policy_class == 'ACT':
        return ACTPolicy(policy_config)
    if policy_class == 'CNNMLP':
        return CNNMLPPolicy(policy_config)
    raise NotImplementedError
def make_optimizer(policy_class, policy):
    """Return the optimizer supplied by ``policy.configure_optimizers()``.

    Only the 'ACT' and 'CNNMLP' policy classes are accepted; any other name
    raises ``NotImplementedError``.
    """
    if policy_class not in ('ACT', 'CNNMLP'):
        raise NotImplementedError
    return policy.configure_optimizers()
def get_image(ts, camera_names):
    """Build one batched CUDA float image tensor from a timestep's camera images.

    Each camera image in ``ts.observation['images']`` is rearranged from
    ``h w c`` to ``c h w``; the cameras are stacked along a new leading axis,
    scaled by 1/255 and given a leading batch axis of size 1.
    """
    per_camera = [
        rearrange(ts.observation['images'][cam_name], 'h w c -> c h w')
        for cam_name in camera_names
    ]
    stacked = np.stack(per_camera, axis=0)
    scaled = torch.from_numpy(stacked / 255.0).float()
    return scaled.cuda().unsqueeze(0)
def forward_pass(data, policy):
    """Move one ``(image, qpos, action, is_pad)`` batch to CUDA and run the policy.

    Returns whatever ``policy(qpos, image, action, is_pad)`` returns.
    """
    images, qpos, actions, pad_mask = data
    images = images.cuda()
    qpos = qpos.cuda()
    actions = actions.cuda()
    pad_mask = pad_mask.cuda()
    return policy(qpos, images, actions, pad_mask)
def train_bc(train_dataloader, val_dataloader, config):
    """Train a policy and return ``(best_epoch, min_val_loss, best_state_dict)``.

    One optimizer step on one batch is taken per "epoch". Every 500 epochs
    the policy is evaluated on validation batches and the best weights so far
    are remembered. Intermediate checkpoints are written every 1000 epochs
    (from epoch 1000 on); at the end the last and best weights are saved into
    ``config['ckpt_dir']``.
    """
    num_epochs = config['num_epochs']
    ckpt_dir = config['ckpt_dir']
    seed = config['seed']
    policy_class = config['policy_class']
    policy_config = config['policy_config']

    set_seed(seed)

    policy = make_policy(policy_class, policy_config)
    policy.cuda()
    optimizer = make_optimizer(policy_class, policy)

    # Optionally start from the weights of an earlier experiment.
    if config['resumeid']:
        task_log_dir = (LOG_DIR / config['task_name']).resolve()
        exp_dir, _ = parse_id(task_log_dir, config['resumeid'])
        policy, _, _ = load_ckpt(policy, exp_dir, config['resume_ckpt'])

    min_val_loss = np.inf
    best_ckpt_info = None
    train_batches = repeater(train_dataloader)
    for epoch in tqdm(range(num_epochs)):
        print(f'\nEpoch {epoch}')

        if epoch % 500 == 0:
            # validation
            with torch.inference_mode():
                policy.eval()
                val_outputs = []
                for batch_idx, val_batch in enumerate(val_dataloader):
                    val_outputs.append(forward_pass(val_batch, policy))
                    if batch_idx > 20:
                        break
                validation_summary = compute_dict_mean(val_outputs)

                epoch_val_loss = validation_summary['loss']
                if epoch_val_loss < min_val_loss:
                    min_val_loss = epoch_val_loss
                    best_ckpt_info = (epoch, min_val_loss, deepcopy(policy.state_dict()))
            # Prefix the keys so validation metrics are logged under 'val/'.
            for k in list(validation_summary.keys()):
                validation_summary[f'val/{k}'] = validation_summary.pop(k)
            wandb.log(validation_summary, step=epoch)
            print(f'Val loss: {epoch_val_loss:.5f}')
            print(''.join(f'{k}: {v.item():.3f} ' for k, v in validation_summary.items()))

        # training
        policy.train()
        optimizer.zero_grad()
        train_output = forward_pass(next(train_batches), policy)
        # backward
        train_output['loss'].backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_summary = detach_dict(train_output)
        epoch_train_loss = epoch_summary['loss']
        print(f'Train loss: {epoch_train_loss:.5f}')
        print(''.join(f'{k}: {v.item():.3f} ' for k, v in epoch_summary.items()))
        wandb.log(epoch_summary, step=epoch)

        if epoch % 1000 == 0 and epoch >= 1000:
            periodic_path = os.path.join(ckpt_dir, f'policy_epoch_{epoch}_seed_{seed}.ckpt')
            torch.save(policy.state_dict(), periodic_path)

    last_path = os.path.join(ckpt_dir, 'policy_last.ckpt')
    torch.save(policy.state_dict(), last_path)

    best_epoch, min_val_loss, best_state_dict = best_ckpt_info
    best_path = os.path.join(ckpt_dir, f'policy_epoch_{best_epoch}_seed_{seed}.ckpt')
    torch.save(best_state_dict, best_path)
    print(f'Training finished:\nSeed {seed}, val loss {min_val_loss:.6f} at epoch {best_epoch}')

    return best_ckpt_info
def plot_history(train_history, validation_history, num_epochs, ckpt_dir, seed):
    """Save one train-vs-validation curve image per metric into ``ckpt_dir``.

    The metric names are taken from the first entry of ``train_history``;
    both histories are spread evenly over ``[0, num_epochs - 1]`` on the
    x axis.
    """
    # save training curves
    for key in train_history[0]:
        plot_path = os.path.join(ckpt_dir, f'train_val_{key}_seed_{seed}.png')
        plt.figure()
        train_values = [entry[key].item() for entry in train_history]
        val_values = [entry[key].item() for entry in validation_history]
        train_x = np.linspace(0, num_epochs-1, len(train_history))
        plt.plot(train_x, train_values, label='train')
        val_x = np.linspace(0, num_epochs-1, len(validation_history))
        plt.plot(val_x, val_values, label='validation')
        plt.tight_layout()
        plt.legend()
        plt.title(key)
        plt.savefig(plot_path)
    print(f'Saved plots to {ckpt_dir}')
def load_ckpt(policy, exp_dir, ckpt_name):
    """Load checkpoint weights from ``exp_dir`` into ``policy``.

    Args:
        policy: object exposing ``load_state_dict``.
        exp_dir: directory that holds the checkpoint files.
        ckpt_name: epoch identifier of the checkpoint to load. When falsy,
            the checkpoint is chosen by ``find_all_ckpt(exp_dir)``.
    Returns:
        ``(policy, ckpt_name, epoch)`` where ``ckpt_name`` is the file name
        that was loaded.
    """
    if ckpt_name:
        epoch = ckpt_name
        ckpt_name = f"policy_epoch_{epoch}_seed_0.ckpt"
        # Checkpoints are written with the training seed in their name, so a
        # run that did not use seed 0 has no "_seed_0" file. Fall back to any
        # seed for the requested epoch before giving up.
        if not (Path(exp_dir) / ckpt_name).exists():
            candidates = sorted(Path(exp_dir).glob(f"policy_epoch_{epoch}_seed_*.ckpt"))
            if candidates:
                ckpt_name = candidates[0].name
    else:
        ckpt_name, epoch = find_all_ckpt(exp_dir)
    resume_path = (Path(exp_dir) / ckpt_name).resolve()
    print("*"*20)
    print(f"Resuming from {resume_path}")
    print("*"*20)
    policy.load_state_dict(torch.load(resume_path))
    return policy, ckpt_name, epoch
def save_jit(config):
    """Trace a trained policy with ``torch.jit.trace`` and save it.

    The policy is rebuilt from ``config``, loaded from the experiment's
    checkpoint, traced on random example inputs and written to
    ``traced_jit_<epoch>.pt`` in the experiment directory. The saved file is
    loaded back once at the end.
    """
    policy_class = config['policy_class']
    policy_config = config['policy_config']

    task_log_dir = (LOG_DIR / config['task_name']).resolve()
    exp_dir, _ = parse_id(task_log_dir, config['exptid'])

    policy = make_policy(policy_class, policy_config)
    policy.cuda()
    policy, _, epoch = load_ckpt(policy, exp_dir, config['resume_ckpt'])
    policy.eval()

    # Random example inputs used only to drive the trace.
    example_images = torch.rand((1, 2, 3, 480, 640), device='cuda')
    example_qpos = torch.rand((1, config['state_dim']), device='cuda')
    traced_policy = torch.jit.trace(policy, (example_qpos, example_images))

    save_path = os.path.join(exp_dir, f"traced_jit_{epoch}.pt")
    traced_policy.save(save_path)
    print("Saved traced actor at ", save_path)

    # Load the file back; the result is not used further.
    torch.jit.load(save_path)
if __name__ == '__main__':
    # Command-line entry point: parse the options, derive the data/log
    # directories used by main(), then run it.
    parser = argparse.ArgumentParser()
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--onscreen_render', action='store_true')
    parser.add_argument('--policy_class', action='store', type=str, help='policy_class, capitalize', required=True)
    parser.add_argument('--batch_size', action='store', type=int, help='batch_size', required=True)
    parser.add_argument('--seed', action='store', type=int, help='seed', required=True)
    parser.add_argument('--num_epochs', action='store', type=int, help='num_epochs', required=True)
    parser.add_argument('--lr', action='store', type=float, help='lr', required=True)
    # The help text used to read 'lr' (copy-paste from the option above).
    parser.add_argument('--qpos_noise_std', action='store', default=0, type=float, help='qpos_noise_std', required=False)

    # for ACT
    parser.add_argument('--kl_weight', action='store', type=int, help='KL Weight', required=False)
    parser.add_argument('--chunk_size', action='store', type=int, help='chunk_size', required=False)
    parser.add_argument('--hidden_dim', action='store', type=int, help='hidden_dim', required=False)
    parser.add_argument('--dim_feedforward', action='store', type=int, help='dim_feedforward', required=False)
    parser.add_argument('--temporal_agg', action='store_true')

    parser.add_argument('--save_jit', action='store_true')
    parser.add_argument('--no_wandb', action='store_true')
    parser.add_argument('--resumeid', action='store', default="", type=str, help='resume id', required=False)
    parser.add_argument('--resume_ckpt', action='store', default="", type=str, help='resume ckpt', required=False)
    parser.add_argument('--taskid', action='store', type=str, help='task id', required=True)
    parser.add_argument('--exptid', action='store', type=str, help='experiment id', required=True)
    parser.add_argument('--source', choices=['self', 'ssd'], default='self')
    args = vars(parser.parse_args())

    # The data directory is a 'data' sibling of the directory chosen here.
    if args['source'] == 'self':
        current_dir = Path(__file__).parent.resolve()
    else:
        current_dir = Path("/media/cxx/Extreme Pro/human2robot/data/").resolve()
    DATA_DIR = (current_dir.parent / 'data/').resolve()
    RECORD_DIR = (DATA_DIR / 'recordings/').resolve()
    LOG_DIR = (DATA_DIR / 'logs/').resolve()

    main(args)

115
act/policy.py

@ -0,0 +1,115 @@
import torch.nn as nn
from torch.nn import functional as F
from torchvision.transforms import v2
import torch
from detr.main import build_ACT_model_and_optimizer, build_CNNMLP_model_and_optimizer
import IPython
e = IPython.embed
class ACTPolicy(nn.Module):
    """Policy wrapper around the model built by ``build_ACT_model_and_optimizer``.

    Called with ``actions`` it returns a dict of losses (``l1``, ``kl``,
    ``loss``); called without, it returns the predicted actions.
    """

    def __init__(self, args_override):
        super().__init__()
        model, optimizer = build_ACT_model_and_optimizer(args_override)
        self.model = model  # CVAE decoder
        self.optimizer = optimizer
        self.kl_weight = args_override['kl_weight']
        self.qpos_noise_std = args_override['qpos_noise_std']
        print(f'KL Weight {self.kl_weight}')

    def __call__(self, qpos, image, actions=None, is_pad=None):
        """Run the model for training (``actions`` given) or inference.

        Args:
            qpos: state tensor passed to the model; not modified.
            image: image tensor; resized and normalized here (and randomly
                augmented when ``actions`` is given).
            actions: target actions; when given, losses are returned.
            is_pad: padding mask for ``actions``; padded positions contribute
                zero to the L1 term.
        """
        env_state = None
        # Images are resized to (patch_h * 14, patch_w * 14).
        patch_h = 16
        patch_w = 22
        target_size = (patch_h * 14, patch_w * 14)
        normalize = v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
        if actions is not None:  # training time
            transform = v2.Compose([
                v2.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                v2.RandomPerspective(distortion_scale=0.5),
                v2.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.9, 1.1)),
                v2.GaussianBlur(kernel_size=(9, 9), sigma=(0.1, 2.0)),
                v2.Resize(target_size),
                normalize,
            ])
            # Out-of-place so the caller's qpos tensor is left untouched.
            # NOTE(review): the square root of qpos_noise_std is used as the
            # noise scale - confirm whether the option is meant as a variance.
            qpos = qpos + (self.qpos_noise_std**0.5)*torch.randn_like(qpos)
        else:  # inference time
            transform = v2.Compose([
                v2.Resize(target_size),
                normalize,
            ])

        image = transform(image)
        if actions is not None:  # training time
            # Only the first num_queries steps are supervised.
            actions = actions[:, :self.model.num_queries]
            is_pad = is_pad[:, :self.model.num_queries]

            a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad)
            total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar)
            loss_dict = dict()
            all_l1 = F.l1_loss(actions, a_hat, reduction='none')
            l1 = (all_l1 * ~is_pad.unsqueeze(-1)).mean()
            loss_dict['l1'] = l1
            loss_dict['kl'] = total_kld[0]
            loss_dict['loss'] = loss_dict['l1'] + loss_dict['kl'] * self.kl_weight
            return loss_dict
        else:  # inference time
            a_hat, _, (_, _) = self.model(qpos, image, env_state)  # no action, sample from prior
            return a_hat

    def configure_optimizers(self):
        """Return the optimizer created together with the model."""
        return self.optimizer
class CNNMLPPolicy(nn.Module):
    """Policy wrapper around the model built by ``build_CNNMLP_model_and_optimizer``.

    Called with ``actions`` it returns a dict of losses (``mse``, ``loss``);
    called without, it returns the predicted action.
    """

    def __init__(self, args_override):
        super().__init__()
        model, optimizer = build_CNNMLP_model_and_optimizer(args_override)
        self.model = model  # decoder
        self.optimizer = optimizer

    def __call__(self, qpos, image, actions=None, is_pad=None):
        env_state = None  # TODO
        # ``transforms`` is not imported in this module; use the ``v2``
        # namespace that is, with the same mean/std.
        normalize = v2.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        image = normalize(image)
        if actions is not None:  # training time
            actions = actions[:, 0]  # only the first action step is supervised
            a_hat = self.model(qpos, image, env_state, actions)
            mse = F.mse_loss(actions, a_hat)
            loss_dict = dict()
            loss_dict['mse'] = mse
            loss_dict['loss'] = loss_dict['mse']
            return loss_dict
        else:  # inference time
            a_hat = self.model(qpos, image, env_state)  # no action, sample from prior
            return a_hat

    def configure_optimizers(self):
        """Return the optimizer created together with the model."""
        return self.optimizer
def kl_divergence(mu, logvar):
    """Compute ``-0.5 * (1 + logvar - mu^2 - exp(logvar))`` and three reductions of it.

    4-D inputs are first viewed as ``(size(0), size(1))``.

    Returns:
        total_kld: sum over dim 1, then mean over dim 0 (kept as a 1-element tensor).
        dimension_wise_kld: mean over dim 0.
        mean_kld: mean over dim 1, then mean over dim 0 (1-element tensor).
    """
    assert mu.size(0) != 0

    if mu.data.ndimension() == 4:
        mu = mu.view(mu.size(0), mu.size(1))
    if logvar.data.ndimension() == 4:
        logvar = logvar.view(logvar.size(0), logvar.size(1))

    per_element = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp())
    total_kld = per_element.sum(1).mean(0, True)
    dimension_wise_kld = per_element.mean(0)
    mean_kld = per_element.mean(1).mean(0, True)
    return total_kld, dimension_wise_kld, mean_kld

286
act/utils.py

@ -0,0 +1,286 @@
import numpy as np
import torch
import os
import h5py
from torch.utils.data import TensorDataset, DataLoader
import time
import IPython
e = IPython.embed
from pathlib import Path
class EpisodicDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one normalized training sample per access.

    Each episode is read from ``processed_episode_<id>.hdf5`` inside
    ``dataset_dir``. State and action arrays are copied into memory when the
    dataset is built; camera frames are kept as h5py dataset handles and read
    on access, so the HDF5 files stay open for the lifetime of the dataset.

    Args:
        episode_ids: Iterable of episode ids used to build the file names.
        dataset_dir: Directory containing the HDF5 episode files.
        camera_names: Camera names; each selects ``observation.image.<name>``.
        norm_stats: Mapping providing ``action_mean``, ``action_std``,
            ``qpos_mean`` and ``qpos_std`` used for normalization.
        episode_len: Per-episode lengths, ordered like ``episode_ids``; their
            cumulative sum defines the flat index space of the dataset.
        history_stack: Number of preceding actions appended to the state
            vector (0 disables this).
    """

    def __init__(self, episode_ids, dataset_dir, camera_names, norm_stats, episode_len, history_stack=0):
        # Zero-argument super() actually runs the base-class initializer; the
        # previous one-argument form only created an unbound super object.
        super().__init__()
        self.episode_ids = episode_ids
        self.dataset_dir = dataset_dir
        self.camera_names = camera_names
        self.norm_stats = norm_stats
        self.is_sim = None
        self.max_pad_len = 200  # number of action rows returned per sample
        action_str = 'qpos_action'
        self.history_stack = history_stack
        self.dataset_paths = []
        self.roots = []
        self.is_sims = []
        self.original_action_shapes = []
        self.states = []
        self.image_dict = {cam_name: [] for cam_name in self.camera_names}
        self.actions = []
        for episode_id in self.episode_ids:
            dataset_path = os.path.join(self.dataset_dir, f'processed_episode_{episode_id}.hdf5')
            self.dataset_paths.append(dataset_path)
            # The file is intentionally left open: image datasets are read lazily.
            root = h5py.File(dataset_path, 'r')
            self.roots.append(root)
            self.is_sims.append(root.attrs['sim'])
            self.original_action_shapes.append(root[action_str].shape)
            self.states.append(np.array(root['observation.state']))
            for cam_name in self.camera_names:
                self.image_dict[cam_name].append(root[f'observation.image.{cam_name}'])
            self.actions.append(np.array(root[action_str]))
        # An empty episode list leaves is_sim as None instead of raising IndexError.
        self.is_sim = self.is_sims[0] if self.is_sims else None
        self.episode_len = episode_len
        self.cumulative_len = np.cumsum(self.episode_len)

    def __len__(self):
        """Return the total number of transitions across all episodes."""
        return int(self.cumulative_len[-1]) if len(self.cumulative_len) else 0

    def _locate_transition(self, index):
        """Map a flat transition index to ``(episode_index, timestep)``."""
        assert index < self.cumulative_len[-1]
        episode_index = np.argmax(self.cumulative_len > index)  # argmax returns first True index
        start_ts = index - (self.cumulative_len[episode_index] - self.episode_len[episode_index])
        return episode_index, start_ts

    def __getitem__(self, ts_index):
        """Build one training sample for the episode that contains ``ts_index``.

        Returns:
            Tuple ``(image_data, qpos_data, action_data, is_pad)``:
            camera frames stacked on a new leading axis and divided by 255,
            the normalized state (with the flattened normalized action
            history appended when ``history_stack > 0``), ``max_pad_len``
            normalized action rows starting at the sampled timestep, and a
            boolean mask that is True for rows past the end of the episode.
        """
        sample_full_episode = False  # hardcode
        index, start_ts = self._locate_transition(ts_index)
        original_action_shape = self.original_action_shapes[index]
        episode_len = original_action_shape[0]
        # NOTE(review): the timestep located above is overwritten here, so only
        # the episode part of ts_index influences the sample; the timestep is
        # re-drawn with np.random on every access.
        if sample_full_episode:
            start_ts = 0
        else:
            start_ts = np.random.choice(episode_len)

        # Observation at start_ts only.
        qpos = self.states[index][start_ts]
        if self.history_stack > 0:
            # Indices before the episode start are clamped to timestep 0.
            last_indices = np.maximum(0, np.arange(start_ts - self.history_stack, start_ts)).astype(int)
            last_action = self.actions[index][last_indices, :]
        all_cam_images = [
            self.image_dict[cam_name][index][start_ts] for cam_name in self.camera_names
        ]

        # Pad the action sequence with copies of its final row so that a full
        # max_pad_len window can be sliced from any start_ts.
        all_time_action = self.actions[index][:]
        all_time_action_padded = np.zeros(
            (self.max_pad_len + original_action_shape[0], original_action_shape[1]),
            dtype=np.float32,
        )
        all_time_action_padded[:episode_len] = all_time_action
        all_time_action_padded[episode_len:] = all_time_action[-1]
        padded_action = all_time_action_padded[start_ts:start_ts + self.max_pad_len]
        real_len = episode_len - start_ts
        is_pad = np.zeros(self.max_pad_len)
        is_pad[real_len:] = 1

        # New leading axis for the different cameras.
        all_cam_images = np.stack(all_cam_images, axis=0)

        # Construct observations.
        image_data = torch.from_numpy(all_cam_images)
        qpos_data = torch.from_numpy(qpos).float()
        action_data = torch.from_numpy(padded_action).float()
        is_pad = torch.from_numpy(is_pad).bool()
        if self.history_stack > 0:
            last_action_data = torch.from_numpy(last_action).float()

        # Normalize image and change dtype to float.
        image_data = image_data / 255.0
        action_data = (action_data - self.norm_stats["action_mean"]) / self.norm_stats["action_std"]
        qpos_data = (qpos_data - self.norm_stats["qpos_mean"]) / self.norm_stats["qpos_std"]
        if self.history_stack > 0:
            last_action_data = (last_action_data - self.norm_stats['action_mean']) / self.norm_stats['action_std']
            qpos_data = torch.cat((qpos_data, last_action_data.flatten()))
        return image_data, qpos_data, action_data, is_pad
def get_norm_stats(dataset_dir, num_episodes):
    """Compute per-dimension normalization statistics over a set of episodes.

    Reads ``processed_episode_<i>.hdf5`` for ``i`` in ``range(num_episodes)``
    from ``dataset_dir`` and pools all timesteps of all episodes together.

    Args:
        dataset_dir: Directory containing the HDF5 episode files.
        num_episodes: Number of consecutively numbered episodes to read.

    Returns:
        Tuple ``(stats, all_episode_len)``. ``stats`` maps ``action_mean``,
        ``action_std``, ``qpos_mean`` and ``qpos_std`` to NumPy arrays (the
        standard deviations are clipped to at least 1e-2) and
        ``example_qpos`` to the state array of the last episode read.
        ``all_episode_len`` lists the number of state rows per episode.

    Raises:
        ValueError: If ``num_episodes`` is not positive.
    """
    if num_episodes <= 0:
        # Fail early with a clear message instead of an opaque torch.cat error.
        raise ValueError(f"num_episodes must be positive, got {num_episodes}")

    action_str = 'qpos_action'
    all_qpos_data = []
    all_action_data = []
    all_episode_len = []
    for episode_idx in range(num_episodes):
        dataset_path = os.path.join(dataset_dir, f'processed_episode_{episode_idx}.hdf5')
        with h5py.File(dataset_path, 'r') as root:
            qpos = root['observation.state'][()]
            action = root[action_str][()]
        all_qpos_data.append(torch.from_numpy(qpos))
        all_action_data.append(torch.from_numpy(action))
        all_episode_len.append(len(qpos))
    # Concatenate along the time axis: rows are timesteps pooled over episodes.
    all_qpos_data = torch.cat(all_qpos_data)
    all_action_data = torch.cat(all_action_data)

    # Action statistics; the std is clipped so normalization never divides by ~0.
    action_mean = all_action_data.mean(dim=0, keepdim=True)
    action_std = all_action_data.std(dim=0, keepdim=True)
    action_std = torch.clip(action_std, 1e-2, np.inf)

    # State statistics, clipped the same way.
    qpos_mean = all_qpos_data.mean(dim=0, keepdim=True)
    qpos_std = all_qpos_data.std(dim=0, keepdim=True)
    qpos_std = torch.clip(qpos_std, 1e-2, np.inf)

    stats = {
        "action_mean": action_mean.numpy().squeeze(),
        "action_std": action_std.numpy().squeeze(),
        "qpos_mean": qpos_mean.numpy().squeeze(),
        "qpos_std": qpos_std.numpy().squeeze(),
        "example_qpos": qpos,
    }
    return stats, all_episode_len
def find_all_processed_episodes(path):
    """List the processed episode files found directly inside ``path``.

    Only entries named ``processed_episode_*.hdf5`` are returned, which is
    the naming scheme the loaders in this module open. Other directory
    entries are ignored so they cannot inflate the episode count.

    Args:
        path: Directory to scan.

    Returns:
        Sorted list of matching file names (not full paths).
    """
    return sorted(
        name
        for name in os.listdir(path)
        if name.startswith('processed_episode_') and name.endswith('.hdf5')
    )
def BatchSampler(batch_size, episode_len_l, sample_weights=None):
    """Endlessly yield lists of ``batch_size`` flat step indices.

    For every index an entry of ``episode_len_l`` is picked (uniformly, or
    according to ``sample_weights`` normalized to probabilities) and then a
    step is drawn uniformly from that entry's range in the cumulative index
    space.

    Args:
        batch_size: Number of indices per yielded batch.
        episode_len_l: Lengths (each entry is summed with ``np.sum``) that
            define consecutive index ranges.
        sample_weights: Optional unnormalized weights, one per entry.

    Yields:
        A list of ``batch_size`` step indices; the generator never stops.
    """
    if sample_weights is None:
        sample_probs = None
    else:
        sample_probs = np.array(sample_weights) / np.sum(sample_weights)

    entry_totals = [np.sum(episode_len) for episode_len in episode_len_l]
    # boundaries[i] .. boundaries[i + 1] is the index range owned by entry i.
    boundaries = np.cumsum([0] + entry_totals)

    def _draw_step_index():
        chosen = np.random.choice(len(episode_len_l), p=sample_probs)
        return np.random.randint(boundaries[chosen], boundaries[chosen + 1])

    while True:
        yield [_draw_step_index() for _ in range(batch_size)]
def load_data(dataset_dir, camera_names, batch_size_train, batch_size_val):
    """Build the training and validation data loaders for a dataset directory.

    Episodes are shuffled and split 99% / 1% into train and validation sets.
    Normalization statistics are computed over all episodes, and each loader
    is driven by an endless ``BatchSampler`` over its own split.

    Args:
        dataset_dir: Directory containing the processed episode files.
        camera_names: Camera names forwarded to ``EpisodicDataset``.
        batch_size_train: Batch size of the training loader.
        batch_size_val: Batch size of the validation loader.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, norm_stats, is_sim)`` where
        ``is_sim`` is taken from the training dataset.
    """
    print(f'\nData from: {dataset_dir}\n')
    num_episodes = len(find_all_processed_episodes(dataset_dir))

    # Train / validation split over a random permutation of episode indices.
    train_ratio = 0.99
    split_at = int(train_ratio * num_episodes)
    shuffled_indices = np.random.permutation(num_episodes)
    train_indices = shuffled_indices[:split_at]
    val_indices = shuffled_indices[split_at:]
    print(f"Train episodes: {len(train_indices)}, Val episodes: {len(val_indices)}")

    # Normalization stats for qpos and action, plus every episode's length.
    norm_stats, all_episode_len = get_norm_stats(dataset_dir, num_episodes)
    train_episode_len_l = [all_episode_len[i] for i in train_indices]
    val_episode_len_l = [all_episode_len[i] for i in val_indices]

    # Training pipeline.
    train_dataset = EpisodicDataset(train_indices, dataset_dir, camera_names, norm_stats, train_episode_len_l)
    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=BatchSampler(batch_size_train, train_episode_len_l),
        pin_memory=True,
        num_workers=24,
        prefetch_factor=2,
    )

    # Validation pipeline.
    val_dataset = EpisodicDataset(val_indices, dataset_dir, camera_names, norm_stats, val_episode_len_l)
    val_dataloader = DataLoader(
        val_dataset,
        batch_sampler=BatchSampler(batch_size_val, val_episode_len_l, None),
        pin_memory=True,
        num_workers=16,
        prefetch_factor=2,
    )

    return train_dataloader, val_dataloader, norm_stats, train_dataset.is_sim
def sample_box_pose():
    """Draw a random cube pose.

    Returns:
        Length-7 array: a position sampled uniformly with x in [0.0, 0.2],
        y in [0.4, 0.6] and z fixed at 0.05, followed by the constant
        quaternion ``[1, 0, 0, 0]``.
    """
    lower_bounds = np.array([0.0, 0.4, 0.05])
    upper_bounds = np.array([0.2, 0.6, 0.05])
    position = np.random.uniform(lower_bounds, upper_bounds)
    orientation = np.array([1, 0, 0, 0])
    return np.concatenate([position, orientation])
def sample_insertion_pose():
    """Draw random poses for a peg and a socket.

    Both positions share y in [0.4, 0.6] and z fixed at 0.05; the peg uses
    x in [0.1, 0.2] and the socket x in [-0.2, -0.1]. Each pose ends with the
    constant quaternion ``[1, 0, 0, 0]``.

    Returns:
        Tuple ``(peg_pose, socket_pose)`` of length-7 arrays.
    """

    def _pose_with_x_between(x_low, x_high):
        lower_bounds = np.array([x_low, 0.4, 0.05])
        upper_bounds = np.array([x_high, 0.6, 0.05])
        position = np.random.uniform(lower_bounds, upper_bounds)
        return np.concatenate([position, np.array([1, 0, 0, 0])])

    # The peg is drawn first, then the socket.
    peg_pose = _pose_with_x_between(0.1, 0.2)
    socket_pose = _pose_with_x_between(-0.2, -0.1)
    return peg_pose, socket_pose
### helper functions
def compute_dict_mean(epoch_dicts):
    """Average a list of dictionaries key by key.

    Args:
        epoch_dicts: Non-empty sequence of dictionaries; the keys of the
            first one are looked up in every dictionary.

    Returns:
        Dictionary mapping each key of the first dictionary to the mean of
        its values across ``epoch_dicts``.
    """
    keys = list(epoch_dicts[0])
    count = len(epoch_dicts)
    averaged = {}
    for key in keys:
        running_total = 0
        for entry in epoch_dicts:
            running_total += entry[key]
        averaged[key] = running_total / count
    return averaged
def detach_dict(d):
    """Return a new dictionary with ``.detach()`` applied to every value.

    Args:
        d: Dictionary whose values provide a ``detach()`` method.

    Returns:
        New dictionary with the same keys, in the same order.
    """
    return {key: value.detach() for key, value in d.items()}
def set_seed(seed):
    """Seed both the PyTorch and the NumPy global random number generators.

    Args:
        seed: Seed value passed to ``torch.manual_seed`` and
            ``np.random.seed``.
    """
    for seed_generator in (torch.manual_seed, np.random.seed):
        seed_generator(seed)
def parse_id(base_dir, prefix):
    """Find a sub-directory of ``base_dir`` whose name starts with ``prefix``.

    Args:
        base_dir: Directory to search (direct children only).
        prefix: Required start of the sub-directory name.

    Returns:
        Tuple ``(path, name)`` of the first matching sub-directory in
        directory-iteration order, or ``(None, None)`` if none matches.

    Raises:
        ValueError: If ``base_dir`` does not exist or is not a directory.
    """
    base_path = Path(base_dir)
    if not (base_path.exists() and base_path.is_dir()):
        raise ValueError(f"The provided base directory does not exist or is not a directory: \n{base_path}")

    matching = (
        child
        for child in base_path.iterdir()
        if child.is_dir() and child.name.startswith(prefix)
    )
    first_match = next(matching, None)
    if first_match is None:
        return None, None
    return str(first_match), first_match.name
def find_all_ckpt(base_dir, prefix="policy_epoch_"):
    """Locate the checkpoint file with the highest epoch number.

    A file qualifies when its name starts with ``prefix`` and the text
    between the prefix and the next underscore parses as an integer epoch.
    Files that start with the prefix but carry no integer epoch are skipped.

    Args:
        base_dir: Directory to search (direct children only).
        prefix: Required start of the checkpoint file name.

    Returns:
        Tuple ``(file_name, epoch)`` for the checkpoint with the largest
        epoch.

    Raises:
        ValueError: If ``base_dir`` does not exist or is not a directory, or
            if it contains no checkpoint matching ``prefix``.
    """
    base_path = Path(base_dir)
    # Ensure the base path exists and is a directory.
    if not base_path.exists() or not base_path.is_dir():
        raise ValueError("The provided base directory does not exist or is not a directory.")

    candidates = []
    for file in base_path.iterdir():
        if not (file.is_file() and file.name.startswith(prefix)):
            continue
        try:
            epoch = int(file.name.split(prefix)[-1].split('_')[0])
        except ValueError:
            # Name carries no integer epoch; ignore it rather than abort.
            continue
        candidates.append((epoch, file.name))

    if not candidates:
        # Previously this surfaced as a bare IndexError on an empty list.
        raise ValueError(f"No checkpoint file starting with {prefix!r} was found in: {base_path}")

    latest_epoch, latest_name = max(candidates, key=lambda item: item[0])
    return latest_name, latest_epoch

79
assets/H1_5/README.md

@ -0,0 +1,79 @@
# Unitree H1 Description (URDF & MJCF)
## Overview
This package includes a streamlined robot description (URDF & MJCF) for the [Unitree H1](https://www.unitree.com/h1/), developed by [Unitree Robotics](https://www.unitree.com/).
<p align="center">
<img src="h1_5.png" width="500"/>
</p>
Unitree H1 has 51 DOFs:
```text
root [⚓] => /pelvis/
left_hip_yaw_joint [⚙+Z] => /left_hip_yaw_link/
left_hip_pitch_joint [⚙+Y] => /left_hip_pitch_link/
left_hip_roll_joint [⚙+X] => /left_hip_roll_link/
left_knee_joint [⚙+Y] => /left_knee_link/
left_ankle_pitch_joint [⚙+Y] => /left_ankle_pitch_link/
left_ankle_roll_joint [⚙+X] => /left_ankle_roll_link/
right_hip_yaw_joint [⚙+Z] => /right_hip_yaw_link/
right_hip_pitch_joint [⚙+Y] => /right_hip_pitch_link/
right_hip_roll_joint [⚙+X] => /right_hip_roll_link/
right_knee_joint [⚙+Y] => /right_knee_link/
right_ankle_pitch_joint [⚙+Y] => /right_ankle_pitch_link/
right_ankle_roll_joint [⚙+X] => /right_ankle_roll_link/
torso_joint [⚙+Z] => /torso_link/
left_shoulder_pitch_joint [⚙+Y] => /left_shoulder_pitch_link/
left_shoulder_roll_joint [⚙+X] => /left_shoulder_roll_link/
left_shoulder_yaw_joint [⚙+Z] => /left_shoulder_yaw_link/
left_elbow_pitch_joint [⚙+Y] => /left_elbow_pitch_link/
left_elbow_roll_joint [⚙+X] => /left_elbow_roll_link/
left_wrist_pitch_joint [⚙+Y] => /left_wrist_pitch_link/
left_wrist_yaw_joint [⚙+Z] => /left_wrist_yaw_link/
L_base_link_joint [⚓] => /L_hand_base_link/
L_thumb_proximal_yaw_joint [⚙+Z] => /L_thumb_proximal_base/
L_thumb_proximal_pitch_joint [⚙-Z] => /L_thumb_proximal/
L_thumb_intermediate_joint [⚙-Z] => /L_thumb_intermediate/
L_thumb_distal_joint [⚙-Z] => /L_thumb_distal/
L_index_proximal_joint [⚙-Z] => /L_index_proximal/
L_index_intermediate_joint [⚙-Z] => /L_index_intermediate/
L_middle_proximal_joint [⚙-Z] => /L_middle_proximal/
L_middle_intermediate_joint [⚙-Z] => /L_middle_intermediate/
L_ring_proximal_joint [⚙-Z] => /L_ring_proximal/
L_ring_intermediate_joint [⚙-Z] => /L_ring_intermediate/
L_pinky_proximal_joint [⚙-Z] => /L_pinky_proximal/
L_pinky_intermediate_joint [⚙-Z] => /L_pinky_intermediate/
right_shoulder_pitch_joint [⚙+Y] => /right_shoulder_pitch_link/
right_shoulder_roll_joint [⚙+X] => /right_shoulder_roll_link/
right_shoulder_yaw_joint [⚙+Z] => /right_shoulder_yaw_link/
right_elbow_pitch_joint [⚙+Y] => /right_elbow_pitch_link/
right_elbow_roll_joint [⚙+X] => /right_elbow_roll_link/
right_wrist_pitch_joint [⚙+Y] => /right_wrist_pitch_link/
right_wrist_yaw_joint [⚙+Z] => /right_wrist_yaw_link/
R_base_link_joint [⚓] => /R_hand_base_link/
R_thumb_proximal_yaw_joint [⚙-Z] => /R_thumb_proximal_base/
R_thumb_proximal_pitch_joint [⚙+Z] => /R_thumb_proximal/
R_thumb_intermediate_joint [⚙+Z] => /R_thumb_intermediate/
R_thumb_distal_joint [⚙+Z] => /R_thumb_distal/
R_index_proximal_joint [⚙+Z] => /R_index_proximal/
R_index_intermediate_joint [⚙+Z] => /R_index_intermediate/
R_middle_proximal_joint [⚙+Z] => /R_middle_proximal/
R_middle_intermediate_joint [⚙+Z] => /R_middle_intermediate/
R_ring_proximal_joint [⚙+Z] => /R_ring_proximal/
R_ring_intermediate_joint [⚙+Z] => /R_ring_intermediate/
R_pinky_proximal_joint [⚙+Z] => /R_pinky_proximal/
R_pinky_intermediate_joint [⚙+Z] => /R_pinky_intermediate/
```
## Visualization with [MuJoCo](https://github.com/google-deepmind/mujoco)
1. Open MuJoCo Viewer
```bash
pip install mujoco
python -m mujoco.viewer
```
2. Drag and drop the MJCF model file (`scene.xml`) to the MuJoCo Viewer.

1623
assets/H1_5/h1_5.urdf
File diff suppressed because it is too large
View File

438
assets/H1_5/h1_5.xml

@ -0,0 +1,438 @@
<mujoco model="h1_5">
<compiler angle="radian" meshdir="meshes/" autolimits="true"/>
<statistic meansize="0.112107" extent="1.95557" center="0.0256948 1.86841e-05 -0.178443"/>
<asset>
<mesh name="pelvis" file="pelvis.STL"/>
<mesh name="left_hip_yaw_link" file="left_hip_yaw_link.STL"/>
<mesh name="left_hip_pitch_link" file="left_hip_pitch_link.STL"/>
<mesh name="left_hip_roll_link" file="left_hip_roll_link.STL"/>
<mesh name="left_knee_link" file="left_knee_link.STL"/>
<mesh name="left_ankle_pitch_link" file="left_ankle_pitch_link.STL"/>
<mesh name="left_ankle_roll_link" file="left_ankle_roll_link.STL"/>
<mesh name="right_hip_yaw_link" file="right_hip_yaw_link.STL"/>
<mesh name="right_hip_pitch_link" file="right_hip_pitch_link.STL"/>
<mesh name="right_hip_roll_link" file="right_hip_roll_link.STL"/>
<mesh name="right_knee_link" file="right_knee_link.STL"/>
<mesh name="right_ankle_pitch_link" file="right_ankle_pitch_link.STL"/>
<mesh name="right_ankle_roll_link" file="right_ankle_roll_link.STL"/>
<mesh name="torso_link" file="torso_link.STL"/>
<mesh name="left_shoulder_pitch_link" file="left_shoulder_pitch_link.STL"/>
<mesh name="left_shoulder_roll_link" file="left_shoulder_roll_link.STL"/>
<mesh name="left_shoulder_yaw_link" file="left_shoulder_yaw_link.STL"/>
<mesh name="left_elbow_pitch_link" file="left_elbow_pitch_link.STL"/>
<mesh name="left_elbow_roll_link" file="left_elbow_roll_link.STL"/>
<mesh name="left_wrist_pitch_link" file="left_wrist_pitch_link.STL"/>
<mesh name="wrist_yaw_link" file="wrist_yaw_link.STL"/>
<mesh name="right_shoulder_pitch_link" file="right_shoulder_pitch_link.STL"/>
<mesh name="right_shoulder_roll_link" file="right_shoulder_roll_link.STL"/>
<mesh name="right_shoulder_yaw_link" file="right_shoulder_yaw_link.STL"/>
<mesh name="right_elbow_pitch_link" file="right_elbow_pitch_link.STL"/>
<mesh name="right_elbow_roll_link" file="right_elbow_roll_link.STL"/>
<mesh name="right_wrist_pitch_link" file="right_wrist_pitch_link.STL"/>
<mesh name="logo_link" file="logo_link.STL"/>
<mesh name="L_hand_base_link" file="L_hand_base_link.STL"/>
<mesh name="link11_L" file="link11_L.STL"/>
<mesh name="link12_L" file="link12_L.STL"/>
<mesh name="link13_L" file="link13_L.STL"/>
<mesh name="link14_L" file="link14_L.STL"/>
<mesh name="link15_L" file="link15_L.STL"/>
<mesh name="link16_L" file="link16_L.STL"/>
<mesh name="link17_L" file="link17_L.STL"/>
<mesh name="link18_L" file="link18_L.STL"/>
<mesh name="link19_L" file="link19_L.STL"/>
<mesh name="link20_L" file="link20_L.STL"/>
<mesh name="link21_L" file="link21_L.STL"/>
<mesh name="link22_L" file="link22_L.STL"/>
<mesh name="R_hand_base_link" file="R_hand_base_link.STL"/>
<mesh name="link11_R" file="link11_R.STL"/>
<mesh name="link12_R" file="link12_R.STL"/>
<mesh name="link13_R" file="link13_R.STL"/>
<mesh name="link14_R" file="link14_R.STL"/>
<mesh name="link15_R" file="link15_R.STL"/>
<mesh name="link16_R" file="link16_R.STL"/>
<mesh name="link17_R" file="link17_R.STL"/>
<mesh name="link18_R" file="link18_R.STL"/>
<mesh name="link19_R" file="link19_R.STL"/>
<mesh name="link20_R" file="link20_R.STL"/>
<mesh name="link21_R" file="link21_R.STL"/>
<mesh name="link22_R" file="link22_R.STL"/>
</asset>
<worldbody>
<body name="pelvis" pos="0 0 1.03">
<inertial pos="-0.0004 3.7e-05 -0.046864" quat="0.497097 0.496809 -0.503132 0.502925" mass="5.983" diaginertia="0.0531565 0.0491678 0.00902583"/>
<joint name="floating_base_joint" type="free"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="pelvis"/>
<geom size="0.05" rgba="0.1 0.1 0.1 1"/>
<body name="left_hip_yaw_link" pos="0 0.0875 -0.1632">
<inertial pos="0 -0.026197 0.006647" quat="0.704899 -0.0553755 0.0548434 0.705013" mass="2.829" diaginertia="0.00574303 0.00455361 0.00349461"/>
<joint name="left_hip_yaw_joint" pos="0 0 0" axis="0 0 1" range="-0.43 0.43"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_hip_yaw_link"/>
<geom size="0.01 0.01" pos="0.02 0 0" quat="0.707107 0 0.707107 0" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="left_hip_pitch_link" pos="0 0.0755 0">
<inertial pos="-0.00781 -0.004724 -6.3e-05" quat="0.701575 0.711394 0.0330266 0.0249149" mass="2.92" diaginertia="0.00560661 0.00445055 0.00385068"/>
<joint name="left_hip_pitch_joint" pos="0 0 0" axis="0 1 0" range="-3.14 2.5"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_hip_pitch_link"/>
<geom size="0.02 0.1" pos="0 0 -0.2" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="left_hip_roll_link">
<inertial pos="0.004171 -0.008576 -0.194509" quat="0.634842 0.0146079 0.0074063 0.772469" mass="4.962" diaginertia="0.0480229 0.0462788 0.00887409"/>
<joint name="left_hip_roll_joint" pos="0 0 0" axis="1 0 0" range="-0.43 3.14"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_hip_roll_link"/>
<geom size="0.02 0.005" pos="0 0.06 0" quat="0.707107 0.707107 0 0" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="left_knee_link" pos="0 0 -0.4">
<inertial pos="0.000179 0.000121 -0.168936" quat="0.416585 0.0104983 0.00514003 0.909021" mass="3.839" diaginertia="0.0391044 0.038959 0.00501125"/>
<joint name="left_knee_joint" pos="0 0 0" axis="0 1 0" range="-0.26 2.05"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_knee_link"/>
<geom size="0.02 0.1" pos="0 0 -0.2" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="left_ankle_pitch_link" pos="0 0 -0.4">
<inertial pos="-0.000294 0 -0.010794" quat="0.999984 0 -0.00574445 0" mass="0.102" diaginertia="2.39454e-05 2.1837e-05 1.34126e-05"/>
<joint name="left_ankle_pitch_joint" pos="0 0 0" axis="0 1 0" range="-0.897334 0.523598"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_ankle_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_ankle_pitch_link"/>
<body name="left_ankle_roll_link" pos="0 0 -0.02">
<inertial pos="0.029589 0 -0.015973" quat="0 0.725858 0 0.687845" mass="0.747" diaginertia="0.00359178 0.00343534 0.000640307"/>
<joint name="left_ankle_roll_joint" pos="0 0 0" axis="1 0 0" range="-0.261799 0.261799"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_ankle_roll_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_ankle_roll_link"/>
</body>
</body>
</body>
</body>
</body>
</body>
<body name="right_hip_yaw_link" pos="0 -0.0875 -0.1632">
<inertial pos="0 0.026197 0.006647" quat="0.705013 0.0548434 -0.0553755 0.704899" mass="2.829" diaginertia="0.00574303 0.00455361 0.00349461"/>
<joint name="right_hip_yaw_joint" pos="0 0 0" axis="0 0 1" range="-0.43 0.43"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_hip_yaw_link"/>
<geom size="0.01 0.01" pos="0.02 0 0" quat="0.707107 0 0.707107 0" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="right_hip_pitch_link" pos="0 -0.0755 0">
<inertial pos="-0.00781 0.004724 -6.3e-05" quat="0.711394 0.701575 -0.0249149 -0.0330266" mass="2.92" diaginertia="0.00560661 0.00445055 0.00385068"/>
<joint name="right_hip_pitch_joint" pos="0 0 0" axis="0 1 0" range="-3.14 2.5"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_hip_pitch_link"/>
<geom size="0.02 0.1" pos="0 0 -0.2" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="right_hip_roll_link">
<inertial pos="0.004171 0.008576 -0.194509" quat="0.772469 0.0074063 0.0146079 0.634842" mass="4.962" diaginertia="0.0480229 0.0462788 0.00887409"/>
<joint name="right_hip_roll_joint" pos="0 0 0" axis="1 0 0" range="-3.14 0.43"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_hip_roll_link"/>
<geom size="0.02 0.005" pos="0 -0.06 0" quat="0.707107 0.707107 0 0" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="right_knee_link" pos="0 0 -0.4">
<inertial pos="0.000179 -0.000121 -0.168936" quat="0.909021 0.00514003 0.0104983 0.416585" mass="3.839" diaginertia="0.0391044 0.038959 0.00501125"/>
<joint name="right_knee_joint" pos="0 0 0" axis="0 1 0" range="-0.26 2.05"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_knee_link"/>
<geom size="0.02 0.1" pos="0 0 -0.2" type="cylinder" rgba="0.1 0.1 0.1 1"/>
<body name="right_ankle_pitch_link" pos="0 0 -0.4">
<inertial pos="-0.000294 0 -0.010794" quat="0.999984 0 -0.00574445 0" mass="0.102" diaginertia="2.39454e-05 2.1837e-05 1.34126e-05"/>
<joint name="right_ankle_pitch_joint" pos="0 0 0" axis="0 1 0" range="-0.897334 0.523598"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_ankle_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_ankle_pitch_link"/>
<body name="right_ankle_roll_link" pos="0 0 -0.02">
<inertial pos="0.029589 0 -0.015973" quat="0 0.725858 0 0.687845" mass="0.747" diaginertia="0.00359178 0.00343534 0.000640307"/>
<joint name="right_ankle_roll_joint" pos="0 0 0" axis="1 0 0" range="-0.261799 0.261799"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_ankle_roll_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_ankle_roll_link"/>
</body>
</body>
</body>
</body>
</body>
</body>
<body name="torso_link">
<inertial pos="0.000489 0.002797 0.20484" quat="0.999989 -0.00130808 -0.00282289 -0.00349105" mass="17.789" diaginertia="0.487315 0.409628 0.127837"/>
<joint name="torso_joint" pos="0 0 0" axis="0 0 1" range="-2.35 2.35"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="torso_link"/>
<geom size="0.04 0.08 0.05" pos="0 0 0.15" type="box" rgba="0.1 0.1 0.1 1"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="1 1 1 1" mesh="logo_link"/>
<site name="imu" size="0.01" pos="-0.04452 -0.01891 0.27756"/>
<body name="left_shoulder_pitch_link" pos="0 0.14806 0.42333" quat="0.991445 0.130526 0 0">
<inertial pos="0.003053 0.06042 -0.0059" quat="0.761799 0.645681 -0.0378496 -0.0363943" mass="1.327" diaginertia="0.000588757 0.00053309 0.000393023"/>
<joint name="left_shoulder_pitch_joint" pos="0 0 0" axis="0 1 0" range="-3.14 1.57"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_shoulder_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_shoulder_pitch_link"/>
<body name="left_shoulder_roll_link" pos="0.0342 0.061999 -0.0060011" quat="0.991445 -0.130526 0 0">
<inertial pos="-0.030932 -1e-06 -0.10609" quat="0.986055 0.000456937 0.166408 0.00213553" mass="1.393" diaginertia="0.00200869 0.00193464 0.000449847"/>
<joint name="left_shoulder_roll_joint" pos="0 0 0" axis="1 0 0" range="-0.38 3.4"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_shoulder_roll_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_shoulder_roll_link"/>
<body name="left_shoulder_yaw_link" pos="-0.0342 0 -0.1456">
<inertial pos="0.004583 0.001128 -0.001128" quat="0.663644 -0.0108866 -0.0267235 0.747492" mass="1.505" diaginertia="0.00431782 0.00420697 0.000645658"/>
<joint name="left_shoulder_yaw_joint" pos="0 0 0" axis="0 0 1" range="-3.01 2.66"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_shoulder_yaw_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_shoulder_yaw_link"/>
<body name="left_elbow_pitch_link" pos="0.006 0.0329 -0.182">
<inertial pos="0.077092 -0.028751 -0.009714" quat="0.544921 0.610781 0.423352 0.388305" mass="0.691" diaginertia="0.000942091 0.000905273 0.00023025"/>
<joint name="left_elbow_pitch_joint" pos="0 0 0" axis="0 1 0" range="-2.53 1.6"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_elbow_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_elbow_pitch_link"/>
<body name="left_elbow_roll_link" pos="0.121 -0.0329 -0.011">
<inertial pos="0.035281 -0.00232 0.000337" quat="0.334998 0.622198 -0.240131 0.66557" mass="0.683" diaginertia="0.00034681 0.000328248 0.000294628"/>
<joint name="left_elbow_roll_joint" pos="0 0 0" axis="1 0 0" range="-2.967 2.967"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_elbow_roll_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_elbow_roll_link"/>
<body name="left_wrist_pitch_link" pos="0.085 0 0">
<inertial pos="0.020395 3.6e-05 -0.002973" quat="0.915893 -0.228405 -0.327262 -0.0432527" mass="0.484" diaginertia="7.25675e-05 7.00325e-05 6.9381e-05"/>
<joint name="left_wrist_pitch_joint" pos="0 0 0" axis="0 1 0" range="-0.471 0.349"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="left_wrist_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="left_wrist_pitch_link"/>
<body name="left_wrist_yaw_link" pos="0.02 0 0">
<inertial pos="0.0770303 -0.00131441 -0.00068617" quat="0.499919 0.510625 0.506813 0.482165" mass="0.26543" diaginertia="0.000854397 0.000723298 0.00022115"/>
<joint name="left_wrist_yaw_joint" pos="0 0 0" axis="0 0 1" range="-1.012 1.012"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="wrist_yaw_link"/>
<geom pos="0.054 0 0" quat="0.707107 0 0 0.707107" type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="L_hand_base_link"/>
<geom pos="0.054 0 0" quat="0.707107 0 0 0.707107" type="mesh" rgba="0.1 0.1 0.1 1" mesh="L_hand_base_link"/>
<body name="L_thumb_proximal_base" pos="0.1231 -0.01696 0.02045" quat="-2.59735e-06 0.707107 0 0.707107">
<inertial pos="0.0048817 0.00038782 -0.00722" quat="0.445981 0.352284 0.495833 0.656617" mass="0.0018869" diaginertia="8.66031e-08 6.87331e-08 4.94199e-08"/>
<joint name="L_thumb_proximal_yaw_joint" pos="0 0 0" axis="0 0 1" range="-0.1 1.3"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link11_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link11_L"/>
<body name="L_thumb_proximal" pos="0.0099867 0.0098242 -0.0089" quat="0.704571 -0.704573 -0.0598169 0.0598167">
<inertial pos="0.021936 -0.01279 -0.0080386" quat="0.25452 0.660687 -0.251949 0.659723" mass="0.0066101" diaginertia="2.78701e-06 2.44024e-06 8.6466e-07"/>
<joint name="L_thumb_proximal_pitch_joint" pos="0 0 0" axis="0 0 -1" range="-0.1 0.6"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link12_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link12_L"/>
<body name="L_thumb_intermediate" pos="0.04407 -0.034553 -0.0008">
<inertial pos="0.0095531 0.0016282 -0.0072002" quat="0.30738 0.636732 -0.307526 0.636803" mass="0.0037844" diaginertia="4.6532e-07 4.48114e-07 2.45646e-07"/>
<joint name="L_thumb_intermediate_joint" pos="0 0 0" axis="0 0 -1" range="0 0.8"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link13_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link13_L"/>
<body name="L_thumb_distal" pos="0.020248 -0.010156 -0.0012">
<inertial pos="0.0092888 -0.004953 -0.0060033" quat="0.266264 0.65596 -0.262836 0.655544" mass="0.003344" diaginertia="2.0026e-07 1.95246e-07 8.1594e-08"/>
<joint name="L_thumb_distal_joint" pos="0 0 0" axis="0 0 -1" range="0 1.2"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link14_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link14_L"/>
</body>
</body>
</body>
</body>
<body name="L_index_proximal" pos="0.19053 0.00028533 0.032268" quat="0.706999 -0.0123409 -0.0123409 0.706999">
<inertial pos="0.0012971 -0.011934 -0.0059998" quat="0.489677 0.510115 -0.489692 0.510099" mass="0.0042405" diaginertia="6.9402e-07 6.62904e-07 2.10916e-07"/>
<joint name="L_index_proximal_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link15_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link15_L"/>
<body name="L_index_intermediate" pos="-0.0024229 -0.032041 -0.001">
<inertial pos="0.0021753 -0.019567 -0.005" quat="0.528694 0.469555 -0.528694 0.469555" mass="0.0045682" diaginertia="7.8176e-07 7.72427e-07 8.47209e-08"/>
<joint name="L_index_intermediate_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link16_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link16_L"/>
</body>
</body>
<body name="L_middle_proximal" pos="0.1911 0.00028533 0.01295" quat="0.707107 0 0 0.707107">
<inertial pos="0.0012971 -0.011934 -0.0059999" quat="0.489677 0.510115 -0.489692 0.510099" mass="0.0042405" diaginertia="6.9402e-07 6.62904e-07 2.10916e-07"/>
<joint name="L_middle_proximal_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link17_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link17_L"/>
<body name="L_middle_intermediate" pos="-0.0024229 -0.032041 -0.001">
<inertial pos="0.001921 -0.020796 -0.0049999" quat="0.531603 0.466115 -0.531728 0.466262" mass="0.0050397" diaginertia="9.8385e-07 9.73288e-07 9.14016e-08"/>
<joint name="L_middle_intermediate_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link18_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link18_L"/>
</body>
</body>
<body name="L_ring_proximal" pos="0.19091 0.00028533 -0.0062872" quat="0.706864 0.0185099 0.0185099 0.706864">
<inertial pos="0.0012971 -0.011934 -0.0059999" quat="0.489677 0.510114 -0.489692 0.510099" mass="0.0042405" diaginertia="6.9402e-07 6.62904e-07 2.10916e-07"/>
<joint name="L_ring_proximal_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link19_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link19_L"/>
<body name="L_ring_intermediate" pos="-0.0024229 -0.032041 -0.001">
<inertial pos="0.0021753 -0.019567 -0.005" quat="0.528694 0.469556 -0.528694 0.469556" mass="0.0045682" diaginertia="7.8176e-07 7.72437e-07 8.47208e-08"/>
<joint name="L_ring_intermediate_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link20_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link20_L"/>
</body>
</body>
<body name="L_pinky_proximal" pos="0.18971 0.00028533 -0.025488" quat="0.706138 0.0370072 0.0370072 0.706138">
<inertial pos="0.0012971 -0.011934 -0.0059999" quat="0.489677 0.510114 -0.489692 0.510099" mass="0.0042405" diaginertia="6.9402e-07 6.62904e-07 2.10916e-07"/>
<joint name="L_pinky_proximal_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link21_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link21_L"/>
<body name="L_pinky_intermediate" pos="-0.0024229 -0.032041 -0.001">
<inertial pos="0.0024788 -0.016208 -0.0050001" quat="0.526797 0.471683 -0.526793 0.471687" mass="0.0036036" diaginertia="4.4881e-07 4.43809e-07 6.5736e-08"/>
<joint name="L_pinky_intermediate_joint" pos="0 0 0" axis="0 0 -1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link22_L"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link22_L"/>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
<body name="right_shoulder_pitch_link" pos="0 -0.14806 0.42333" quat="0.991445 -0.130526 0 0">
<inertial pos="0.003053 -0.06042 -0.0059" quat="0.645681 0.761799 0.0363943 0.0378496" mass="1.327" diaginertia="0.000588757 0.00053309 0.000393023"/>
<joint name="right_shoulder_pitch_joint" pos="0 0 0" axis="0 1 0" range="-1.57 3.14"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_shoulder_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_shoulder_pitch_link"/>
<body name="right_shoulder_roll_link" pos="0.0342 -0.061999 -0.0060011" quat="0.991445 0.130526 0 0">
<inertial pos="-0.030932 1e-06 -0.10609" quat="0.986055 -0.000456937 0.166408 -0.00213553" mass="1.393" diaginertia="0.00200869 0.00193464 0.000449847"/>
<joint name="right_shoulder_roll_joint" pos="0 0 0" axis="1 0 0" range="-3.4 0.38"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_shoulder_roll_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_shoulder_roll_link"/>
<body name="right_shoulder_yaw_link" pos="-0.0342 0 -0.1456">
<inertial pos="0.004583 -0.001128 -0.001128" quat="0.747492 -0.0267235 -0.0108866 0.663644" mass="1.505" diaginertia="0.00431782 0.00420697 0.000645658"/>
<joint name="right_shoulder_yaw_joint" pos="0 0 0" axis="0 0 1" range="-2.66 3.01"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_shoulder_yaw_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_shoulder_yaw_link"/>
<body name="right_elbow_pitch_link" pos="0.006 -0.0329 -0.182">
<inertial pos="0.077092 0.028751 -0.009714" quat="0.388305 0.423352 0.610781 0.544921" mass="0.691" diaginertia="0.000942091 0.000905273 0.00023025"/>
<joint name="right_elbow_pitch_joint" pos="0 0 0" axis="0 1 0" range="-1.6 2.53"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_elbow_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_elbow_pitch_link"/>
<body name="right_elbow_roll_link" pos="0.121 0.0329 -0.011">
<inertial pos="0.035281 -0.00232 0.000337" quat="0.334998 0.622198 -0.240131 0.66557" mass="0.683" diaginertia="0.00034681 0.000328248 0.000294628"/>
<joint name="right_elbow_roll_joint" pos="0 0 0" axis="1 0 0" range="-2.967 2.967"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_elbow_roll_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_elbow_roll_link"/>
<body name="right_wrist_pitch_link" pos="0.085 0 0">
<inertial pos="0.020395 3.6e-05 -0.002973" quat="0.915893 -0.228405 -0.327262 -0.0432527" mass="0.484" diaginertia="7.25675e-05 7.00325e-05 6.9381e-05"/>
<joint name="right_wrist_pitch_joint" pos="0 0 0" axis="0 1 0" range="-0.471 0.349"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="right_wrist_pitch_link"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="right_wrist_pitch_link"/>
<body name="right_wrist_yaw_link" pos="0.02 0 0">
<inertial pos="0.0770303 0.0013013 -0.000699011" quat="0.482149 0.506915 0.510629 0.499827" mass="0.26543" diaginertia="0.00085381 0.000722728 0.000221145"/>
<joint name="right_wrist_yaw_joint" pos="0 0 0" axis="0 0 1" range="-1.012 1.012"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="wrist_yaw_link"/>
<geom pos="0.054 0 0" quat="0 0.707107 -0.707107 0" type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="R_hand_base_link"/>
<geom pos="0.054 0 0" quat="0 0.707107 -0.707107 0" type="mesh" rgba="0.1 0.1 0.1 1" mesh="R_hand_base_link"/>
<body name="R_thumb_proximal_base" pos="0.1231 0.01696 0.02045" quat="-0.707107 -2.59735e-06 -0.707107 0">
<inertial pos="-0.0048064 0.0009382 -0.00757" quat="0.515015 0.680854 0.408023 0.323596" mass="0.0018869" diaginertia="8.66026e-08 6.8732e-08 4.94194e-08"/>
<joint name="R_thumb_proximal_yaw_joint" pos="0 0 0" axis="0 0 -1" range="-0.1 1.3"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link11_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link11_R"/>
<body name="R_thumb_proximal" pos="-0.0088099 0.010892 -0.00925" quat="0.0996843 0.0996847 0.700046 0.700044">
<inertial pos="0.021932 0.012785 -0.0080386" quat="-0.254474 0.660716 0.251893 0.659733" mass="0.0066075" diaginertia="2.78601e-06 2.43933e-06 8.64566e-07"/>
<joint name="R_thumb_proximal_pitch_joint" pos="0 0 0" axis="0 0 1" range="-0.1 0.6"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link12_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link12_R"/>
<body name="R_thumb_intermediate" pos="0.04407 0.034553 -0.0008">
<inertial pos="0.0095544 -0.0016282 -0.0071997" quat="0.636718 0.307389 -0.636802 0.307548" mass="0.0037847" diaginertia="4.6531e-07 4.48089e-07 2.45661e-07"/>
<joint name="R_thumb_intermediate_joint" pos="0 0 0" axis="0 0 1" range="0 0.8"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link13_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link13_R"/>
<body name="R_thumb_distal" pos="0.020248 0.010156 -0.0012">
<inertial pos="0.0092888 0.0049529 -0.0060033" quat="-0.266294 0.655967 0.262806 0.655537" mass="0.0033441" diaginertia="2.0026e-07 1.95247e-07 8.1593e-08"/>
<joint name="R_thumb_distal_joint" pos="0 0 0" axis="0 0 1" range="0 1.2"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link14_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link14_R"/>
</body>
</body>
</body>
</body>
<body name="R_index_proximal" pos="0.19053 -0.00028533 0.032268" quat="0.706999 0.0123358 -0.0123358 -0.706999">
<inertial pos="0.0012259 0.011942 -0.0060001" quat="0.50867 0.49121 -0.508643 0.491172" mass="0.0042403" diaginertia="6.9398e-07 6.62871e-07 2.10909e-07"/>
<joint name="R_index_proximal_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link15_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link15_R"/>
<body name="R_index_intermediate" pos="-0.0026138 0.032026 -0.001">
<inertial pos="0.0019697 0.019589 -0.005" quat="0.466773 0.531152 -0.466773 0.531153" mass="0.0045683" diaginertia="7.8179e-07 7.72465e-07 8.47212e-08"/>
<joint name="R_index_intermediate_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link16_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link16_R"/>
</body>
</body>
<body name="R_middle_proximal" pos="0.1911 -0.00028533 0.01295" quat="0.707107 -2.59735e-06 2.59735e-06 -0.707107">
<inertial pos="0.001297 0.011934 -0.0060001" quat="0.510131 0.489693 -0.510105 0.489653" mass="0.0042403" diaginertia="6.9397e-07 6.62865e-07 2.10915e-07"/>
<joint name="R_middle_proximal_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link17_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link17_R"/>
<body name="R_middle_intermediate" pos="-0.0024229 0.032041 -0.001">
<inertial pos="0.001921 0.020796 -0.005" quat="0.466148 0.531627 -0.466229 0.531705" mass="0.0050396" diaginertia="9.8384e-07 9.73279e-07 9.14014e-08"/>
<joint name="R_middle_intermediate_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link18_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link18_R"/>
</body>
</body>
<body name="R_ring_proximal" pos="0.19091 -0.00028533 -0.0062872" quat="-0.706864 0.0185215 -0.0185215 0.706864">
<inertial pos="0.001297 0.011934 -0.0060002" quat="0.510129 0.489691 -0.510107 0.489654" mass="0.0042403" diaginertia="6.9397e-07 6.62865e-07 2.10915e-07"/>
<joint name="R_ring_proximal_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link19_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link19_R"/>
<body name="R_ring_intermediate" pos="-0.0024229 0.032041 -0.001">
<inertial pos="0.0021753 0.019567 -0.005" quat="0.469554 0.528695 -0.469554 0.528695" mass="0.0045683" diaginertia="7.8177e-07 7.72448e-07 8.4722e-08"/>
<joint name="R_ring_intermediate_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link20_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link20_R"/>
</body>
</body>
<body name="R_pinky_proximal" pos="0.18971 -0.00028533 -0.025488" quat="-0.706138 0.0369975 -0.0369975 0.706138">
<inertial pos="0.001297 0.011934 -0.0060001" quat="0.51013 0.489693 -0.510106 0.489653" mass="0.0042403" diaginertia="6.9397e-07 6.62865e-07 2.10915e-07"/>
<joint name="R_pinky_proximal_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link21_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link21_R"/>
<body name="R_pinky_intermediate" pos="-0.0024229 0.032041 -0.001">
<inertial pos="0.0024748 0.016203 -0.0050031" quat="0.47398 0.528862 -0.469291 0.524799" mass="0.0035996" diaginertia="4.4867e-07 4.43723e-07 6.56538e-08"/>
<joint name="R_pinky_intermediate_joint" pos="0 0 0" axis="0 0 1" range="0 1.7"/>
<geom type="mesh" contype="0" conaffinity="0" group="1" density="0" rgba="0.1 0.1 0.1 1" mesh="link22_R"/>
<geom type="mesh" rgba="0.1 0.1 0.1 1" mesh="link22_R"/>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</worldbody>
<actuator>
<motor name="left_hip_yaw_joint" joint="left_hip_yaw_joint" ctrlrange="-200 200"/>
<motor name="left_hip_pitch_joint" joint="left_hip_pitch_joint" ctrlrange="-200 200"/>
<motor name="left_hip_roll_joint" joint="left_hip_roll_joint" ctrlrange="-200 200"/>
<motor name="left_knee_joint" joint="left_knee_joint" ctrlrange="-300 300"/>
<motor name="left_ankle_pitch_joint" joint="left_ankle_pitch_joint" ctrlrange="-60 60"/>
<motor name="left_ankle_roll_joint" joint="left_ankle_roll_joint" ctrlrange="-40 40"/>
<motor name="right_hip_yaw_joint" joint="right_hip_yaw_joint" ctrlrange="-200 200"/>
<motor name="right_hip_pitch_joint" joint="right_hip_pitch_joint" ctrlrange="-200 200"/>
<motor name="right_hip_roll_joint" joint="right_hip_roll_joint" ctrlrange="-200 200"/>
<motor name="right_knee_joint" joint="right_knee_joint" ctrlrange="-300 300"/>
<motor name="right_ankle_pitch_joint" joint="right_ankle_pitch_joint" ctrlrange="-60 60"/>
<motor name="right_ankle_roll_joint" joint="right_ankle_roll_joint" ctrlrange="-40 40"/>
<motor name="torso_joint" joint="torso_joint" ctrlrange="-200 200"/>
<motor name="left_shoulder_pitch_joint" joint="left_shoulder_pitch_joint" ctrlrange="-40 40"/>
<motor name="left_shoulder_roll_joint" joint="left_shoulder_roll_joint" ctrlrange="-40 40"/>
<motor name="left_shoulder_yaw_joint" joint="left_shoulder_yaw_joint" ctrlrange="-18 18"/>
<motor name="left_elbow_pitch_joint" joint="left_elbow_pitch_joint" ctrlrange="-18 18"/>
<motor name="left_elbow_roll_joint" joint="left_elbow_roll_joint" ctrlrange="-19 19"/>
<motor name="left_wrist_pitch_joint" joint="left_wrist_pitch_joint" ctrlrange="-19 19"/>
<motor name="left_wrist_yaw_joint" joint="left_wrist_yaw_joint" ctrlrange="-19 19"/>
<motor name="right_shoulder_pitch_joint" joint="right_shoulder_pitch_joint" ctrlrange="-40 40"/>
<motor name="right_shoulder_roll_joint" joint="right_shoulder_roll_joint" ctrlrange="-40 40"/>
<motor name="right_shoulder_yaw_joint" joint="right_shoulder_yaw_joint" ctrlrange="-18 18"/>
<motor name="right_elbow_pitch_joint" joint="right_elbow_pitch_joint" ctrlrange="-18 18"/>
<motor name="right_elbow_roll_joint" joint="right_elbow_roll_joint" ctrlrange="-19 19"/>
<motor name="right_wrist_pitch_joint" joint="right_wrist_pitch_joint" ctrlrange="-19 19"/>
<motor name="right_wrist_yaw_joint" joint="right_wrist_yaw_joint" ctrlrange="-19 19"/>
<motor name="L_index_proximal_joint" joint="L_index_proximal_joint" ctrlrange="-1 1"/>
<motor name="L_index_intermediate_joint" joint="L_index_intermediate_joint" ctrlrange="-1 1"/>
<motor name="L_middle_proximal_joint" joint="L_middle_proximal_joint" ctrlrange="-1 1"/>
<motor name="L_middle_intermediate_joint" joint="L_middle_intermediate_joint" ctrlrange="-1 1"/>
<motor name="L_ring_proximal_joint" joint="L_ring_proximal_joint" ctrlrange="-1 1"/>
<motor name="L_ring_intermediate_joint" joint="L_ring_intermediate_joint" ctrlrange="-1 1"/>
<motor name="L_pinky_proximal_joint" joint="L_pinky_proximal_joint" ctrlrange="-1 1"/>
<motor name="L_pinky_intermediate_joint" joint="L_pinky_intermediate_joint" ctrlrange="-1 1"/>
<motor name="L_thumb_proximal_yaw_joint" joint="L_thumb_proximal_yaw_joint" ctrlrange="-1 1"/>
<motor name="L_thumb_proximal_pitch_joint" joint="L_thumb_proximal_pitch_joint" ctrlrange="-1 1"/>
<motor name="L_thumb_intermediate_joint" joint="L_thumb_intermediate_joint" ctrlrange="-1 1"/>
<motor name="L_thumb_distal_joint" joint="L_thumb_distal_joint" ctrlrange="-1 1"/>
<motor name="R_index_proximal_joint" joint="R_index_proximal_joint" ctrlrange="-1 1"/>
<motor name="R_index_intermediate_joint" joint="R_index_intermediate_joint" ctrlrange="-1 1"/>
<motor name="R_middle_proximal_joint" joint="R_middle_proximal_joint" ctrlrange="-1 1"/>
<motor name="R_middle_intermediate_joint" joint="R_middle_intermediate_joint" ctrlrange="-1 1"/>
<motor name="R_ring_proximal_joint" joint="R_ring_proximal_joint" ctrlrange="-1 1"/>
<motor name="R_ring_intermediate_joint" joint="R_ring_intermediate_joint" ctrlrange="-1 1"/>
<motor name="R_pinky_proximal_joint" joint="R_pinky_proximal_joint" ctrlrange="-1 1"/>
<motor name="R_pinky_intermediate_joint" joint="R_pinky_intermediate_joint" ctrlrange="-1 1"/>
<motor name="R_thumb_proximal_yaw_joint" joint="R_thumb_proximal_yaw_joint" ctrlrange="-1 1"/>
<motor name="R_thumb_proximal_pitch_joint" joint="R_thumb_proximal_pitch_joint" ctrlrange="-1 1"/>
<motor name="R_thumb_intermediate_joint" joint="R_thumb_intermediate_joint" ctrlrange="-1 1"/>
<motor name="R_thumb_distal_joint" joint="R_thumb_distal_joint" ctrlrange="-1 1"/>
</actuator>
<sensor>
<gyro name="imu-angular-velocity" site="imu" noise="5e-4" cutoff="34.9"/>
<accelerometer name="imu-linear-acceleration" site="imu" noise="1e-2" cutoff="157"/>
</sensor>
</mujoco>

BIN
assets/H1_5/meshes/L_hand_base_link.STL

BIN
assets/H1_5/meshes/L_index_intermediate.STL

BIN
assets/H1_5/meshes/L_index_proximal.STL

BIN
assets/H1_5/meshes/L_middle_intermediate.STL

BIN
assets/H1_5/meshes/L_middle_proximal.STL

BIN
assets/H1_5/meshes/L_pinky_intermediate.STL

BIN
assets/H1_5/meshes/L_pinky_proximal.STL

BIN
assets/H1_5/meshes/L_ring_intermediate.STL

BIN
assets/H1_5/meshes/L_ring_proximal.STL

BIN
assets/H1_5/meshes/L_thumb_distal.STL

BIN
assets/H1_5/meshes/L_thumb_intermediate.STL

BIN
assets/H1_5/meshes/L_thumb_proximal.STL

BIN
assets/H1_5/meshes/L_thumb_proximal_base.STL

BIN
assets/H1_5/meshes/R_hand_base_link.STL

BIN
assets/H1_5/meshes/R_index_intermediate.STL

BIN
assets/H1_5/meshes/R_index_proximal.STL

BIN
assets/H1_5/meshes/R_middle_intermediate.STL

BIN
assets/H1_5/meshes/R_middle_proximal.STL

BIN
assets/H1_5/meshes/R_pinky_intermediate.STL

BIN
assets/H1_5/meshes/R_pinky_proximal.STL

BIN
assets/H1_5/meshes/R_ring_intermediate.STL

BIN
assets/H1_5/meshes/R_ring_proximal.STL

BIN
assets/H1_5/meshes/R_thumb_distal.STL

BIN
assets/H1_5/meshes/R_thumb_intermediate.STL

BIN
assets/H1_5/meshes/R_thumb_proximal.STL

BIN
assets/H1_5/meshes/R_thumb_proximal_base.STL

BIN
assets/H1_5/meshes/left_ankle_A_link.STL

BIN
assets/H1_5/meshes/left_ankle_A_rod_link.STL

BIN
assets/H1_5/meshes/left_ankle_B_link.STL

BIN
assets/H1_5/meshes/left_ankle_B_rod_link.STL

BIN
assets/H1_5/meshes/left_ankle_pitch_link.STL

BIN
assets/H1_5/meshes/left_ankle_roll_link.STL

BIN
assets/H1_5/meshes/left_elbow_pitch_link.STL

BIN
assets/H1_5/meshes/left_elbow_roll_link.STL

BIN
assets/H1_5/meshes/left_hand_link.STL

BIN
assets/H1_5/meshes/left_hip_pitch_link.STL

BIN
assets/H1_5/meshes/left_hip_roll_link.STL

BIN
assets/H1_5/meshes/left_hip_yaw_link.STL

BIN
assets/H1_5/meshes/left_knee_link.STL

BIN
assets/H1_5/meshes/left_shoulder_pitch_link.STL

BIN
assets/H1_5/meshes/left_shoulder_roll_link.STL

BIN
assets/H1_5/meshes/left_shoulder_yaw_link.STL

BIN
assets/H1_5/meshes/left_wrist_pitch_link.STL

BIN
assets/H1_5/meshes/link11_L.STL

BIN
assets/H1_5/meshes/link11_R.STL

BIN
assets/H1_5/meshes/link12_L.STL

BIN
assets/H1_5/meshes/link12_R.STL

BIN
assets/H1_5/meshes/link13_L.STL

BIN
assets/H1_5/meshes/link13_R.STL

BIN
assets/H1_5/meshes/link14_L.STL

BIN
assets/H1_5/meshes/link14_R.STL

BIN
assets/H1_5/meshes/link15_L.STL

BIN
assets/H1_5/meshes/link15_R.STL

BIN
assets/H1_5/meshes/link16_L.STL

BIN
assets/H1_5/meshes/link16_R.STL

BIN
assets/H1_5/meshes/link17_L.STL

BIN
assets/H1_5/meshes/link17_R.STL

BIN
assets/H1_5/meshes/link18_L.STL

BIN
assets/H1_5/meshes/link18_R.STL

BIN
assets/H1_5/meshes/link19_L.STL

BIN
assets/H1_5/meshes/link19_R.STL

BIN
assets/H1_5/meshes/link20_L.STL

BIN
assets/H1_5/meshes/link20_R.STL

BIN
assets/H1_5/meshes/link21_L.STL

BIN
assets/H1_5/meshes/link21_R.STL

BIN
assets/H1_5/meshes/link22_L.STL

BIN
assets/H1_5/meshes/link22_R.STL

BIN
assets/H1_5/meshes/logo_link.STL

BIN
assets/H1_5/meshes/pelvis.STL

BIN
assets/H1_5/meshes/right_ankle_A_link.STL

BIN
assets/H1_5/meshes/right_ankle_A_rod_link.STL

BIN
assets/H1_5/meshes/right_ankle_B_link.STL

BIN
assets/H1_5/meshes/right_ankle_B_rod_link.STL

BIN
assets/H1_5/meshes/right_ankle_link.STL

BIN
assets/H1_5/meshes/right_ankle_pitch_link.STL

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save