Spaces:
Paused
Paused
Commit
·
b19c7bf
1
Parent(s):
27a6ae5
solve
Browse files
app.py
CHANGED
|
@@ -42,7 +42,7 @@ def vggsfm_demo(
|
|
| 42 |
|
| 43 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 44 |
|
| 45 |
-
max_input_image =
|
| 46 |
|
| 47 |
target_dir = f"input_images_{timestamp}"
|
| 48 |
if os.path.exists(target_dir):
|
|
@@ -203,7 +203,7 @@ with gr.Blocks() as demo:
|
|
| 203 |
<li>upload the images (.jpg, .png, etc.), or </li>
|
| 204 |
<li>upload a video (.mp4, .mov, etc.) </li>
|
| 205 |
</ul>
|
| 206 |
-
<p>If both images and videos are uploaded, the demo will only reconstruct the uploaded images. By default, we extract <strong> 1 image frame per second from the input video </strong>. To prevent crashes on the Hugging Face space, we currently limit reconstruction to the first
|
| 207 |
<p>SfM methods are designed for <strong> rigid/static reconstruction </strong>. When dealing with dynamic/moving inputs, these methods may still work by focusing on the rigid parts of the scene. However, to ensure high-quality results, it is better to minimize the presence of moving objects in the input data. </p>
|
| 208 |
<p>The reconstruction should typically take <strong> up to 90 seconds </strong>. If it takes longer, the input data is likely not well-conditioned. </p>
|
| 209 |
<p>If you encounter any problems, feel free to create an issue in our <a href="https://github.com/facebookresearch/vggsfm" target="_blank">GitHub Repo</a> ⭐</p>
|
|
@@ -245,6 +245,7 @@ with gr.Blocks() as demo:
|
|
| 245 |
cache_examples=True,
|
| 246 |
)
|
| 247 |
|
|
|
|
| 248 |
submit_btn.click(
|
| 249 |
vggsfm_demo,
|
| 250 |
[input_video, input_images, num_query_images, num_query_points],
|
|
|
|
| 42 |
|
| 43 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 44 |
|
| 45 |
+
max_input_image = 25
|
| 46 |
|
| 47 |
target_dir = f"input_images_{timestamp}"
|
| 48 |
if os.path.exists(target_dir):
|
|
|
|
| 203 |
<li>upload the images (.jpg, .png, etc.), or </li>
|
| 204 |
<li>upload a video (.mp4, .mov, etc.) </li>
|
| 205 |
</ul>
|
| 206 |
+
<p>If both images and videos are uploaded, the demo will only reconstruct the uploaded images. By default, we extract <strong> 1 image frame per second from the input video </strong>. To prevent crashes on the Hugging Face space, we currently limit reconstruction to the first 25 image frames. </p>
|
| 207 |
<p>SfM methods are designed for <strong> rigid/static reconstruction </strong>. When dealing with dynamic/moving inputs, these methods may still work by focusing on the rigid parts of the scene. However, to ensure high-quality results, it is better to minimize the presence of moving objects in the input data. </p>
|
| 208 |
<p>The reconstruction should typically take <strong> up to 90 seconds </strong>. If it takes longer, the input data is likely not well-conditioned. </p>
|
| 209 |
<p>If you encounter any problems, feel free to create an issue in our <a href="https://github.com/facebookresearch/vggsfm" target="_blank">GitHub Repo</a> ⭐</p>
|
|
|
|
| 245 |
cache_examples=True,
|
| 246 |
)
|
| 247 |
|
| 248 |
+
|
| 249 |
submit_btn.click(
|
| 250 |
vggsfm_demo,
|
| 251 |
[input_video, input_images, num_query_images, num_query_points],
|
vggsfm_code/vggsfm/models/triangulator.py
CHANGED
|
@@ -323,7 +323,7 @@ class Triangulator(nn.Module):
|
|
| 323 |
# We adopt LORANSAC here again
|
| 324 |
|
| 325 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 326 |
-
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 327 |
)
|
| 328 |
|
| 329 |
# Determine valid tracks based on inlier numbers
|
|
|
|
| 323 |
# We adopt LORANSAC here again
|
| 324 |
|
| 325 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 326 |
+
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 327 |
)
|
| 328 |
|
| 329 |
# Determine valid tracks based on inlier numbers
|
vggsfm_code/vggsfm/utils/triangulation.py
CHANGED
|
@@ -755,7 +755,7 @@ def iterative_global_BA(
|
|
| 755 |
|
| 756 |
# triangulate tracks by LORANSAC
|
| 757 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 758 |
-
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 759 |
)
|
| 760 |
|
| 761 |
best_triangulated_points[valid_tracks] = points3D_opt
|
|
|
|
| 755 |
|
| 756 |
# triangulate tracks by LORANSAC
|
| 757 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 758 |
+
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 759 |
)
|
| 760 |
|
| 761 |
best_triangulated_points[valid_tracks] = points3D_opt
|
vggsfm_code/vggsfm/utils/triangulation_helpers.py
CHANGED
|
@@ -384,7 +384,7 @@ def generate_combinations(N):
|
|
| 384 |
return comb_array
|
| 385 |
|
| 386 |
|
| 387 |
-
def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_num=50):
|
| 388 |
"""
|
| 389 |
Local Refinement for triangulation
|
| 390 |
"""
|
|
@@ -392,7 +392,6 @@ def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_nu
|
|
| 392 |
batch_index = torch.arange(B).unsqueeze(-1).expand(-1, lo_num)
|
| 393 |
|
| 394 |
points1_expand = points1.unsqueeze(1).expand(-1, lo_num, -1, -1)
|
| 395 |
-
extrinsics_expand = extrinsics.unsqueeze(1).expand(-1, lo_num, -1, -1, -1)
|
| 396 |
|
| 397 |
# The sets selected for local refinement
|
| 398 |
lo_indices = sorted_indices[:, :lo_num]
|
|
@@ -402,18 +401,38 @@ def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_nu
|
|
| 402 |
lo_points1 = torch.zeros_like(points1_expand)
|
| 403 |
lo_points1[lo_mask] = points1_expand[lo_mask]
|
| 404 |
|
| 405 |
-
lo_points1 = lo_points1.reshape(B * lo_num, N, -1)
|
| 406 |
-
lo_mask = lo_mask.reshape(B * lo_num, N)
|
| 407 |
-
lo_extrinsics = extrinsics_expand.reshape(B * lo_num, N, 3, 4)
|
| 408 |
-
|
| 409 |
-
# triangulate the inliers
|
| 410 |
-
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
| 411 |
-
lo_extrinsics, lo_points1, mask=lo_mask, compute_tri_angle=True, check_cheirality=True
|
| 412 |
-
)
|
| 413 |
|
| 414 |
-
triangulated_points = triangulated_points.reshape(B, lo_num, 3)
|
| 415 |
-
tri_angles = tri_angles.reshape(B, lo_num, -1)
|
| 416 |
|
| 417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
|
| 419 |
return triangulated_points, tri_angles, invalid_che_mask
|
|
|
|
| 384 |
return comb_array
|
| 385 |
|
| 386 |
|
| 387 |
+
def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_num=50, low_mem=True):
|
| 388 |
"""
|
| 389 |
Local Refinement for triangulation
|
| 390 |
"""
|
|
|
|
| 392 |
batch_index = torch.arange(B).unsqueeze(-1).expand(-1, lo_num)
|
| 393 |
|
| 394 |
points1_expand = points1.unsqueeze(1).expand(-1, lo_num, -1, -1)
|
|
|
|
| 395 |
|
| 396 |
# The sets selected for local refinement
|
| 397 |
lo_indices = sorted_indices[:, :lo_num]
|
|
|
|
| 401 |
lo_points1 = torch.zeros_like(points1_expand)
|
| 402 |
lo_points1[lo_mask] = points1_expand[lo_mask]
|
| 403 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
|
|
|
|
|
|
| 405 |
|
| 406 |
+
if low_mem:
|
| 407 |
+
all_triangulated_points = []
|
| 408 |
+
all_tri_angles = []
|
| 409 |
+
all_invalid_che_mask = []
|
| 410 |
+
|
| 411 |
+
for loidx in range(lo_num):
|
| 412 |
+
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
| 413 |
+
extrinsics, lo_points1[:, loidx], mask=lo_mask[:, loidx], compute_tri_angle=True, check_cheirality=True
|
| 414 |
+
)
|
| 415 |
+
# Append the outputs to the respective lists
|
| 416 |
+
all_triangulated_points.append(triangulated_points[:, None])
|
| 417 |
+
all_tri_angles.append(tri_angles[:, None])
|
| 418 |
+
all_invalid_che_mask.append(invalid_che_mask[:,None])
|
| 419 |
+
|
| 420 |
+
triangulated_points = torch.cat(all_triangulated_points, dim=1)
|
| 421 |
+
tri_angles = torch.cat(all_tri_angles, dim=1)
|
| 422 |
+
invalid_che_mask = torch.cat(all_invalid_che_mask, dim=1)
|
| 423 |
+
else:
|
| 424 |
+
extrinsics_expand = extrinsics.unsqueeze(1).expand(-1, lo_num, -1, -1, -1)
|
| 425 |
+
lo_points1 = lo_points1.reshape(B * lo_num, N, -1)
|
| 426 |
+
lo_mask = lo_mask.reshape(B * lo_num, N)
|
| 427 |
+
lo_extrinsics = extrinsics_expand.reshape(B * lo_num, N, 3, 4)
|
| 428 |
+
|
| 429 |
+
# triangulate the inliers
|
| 430 |
+
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
| 431 |
+
lo_extrinsics, lo_points1, mask=lo_mask, compute_tri_angle=True, check_cheirality=True
|
| 432 |
+
)
|
| 433 |
+
|
| 434 |
+
triangulated_points = triangulated_points.reshape(B, lo_num, 3)
|
| 435 |
+
tri_angles = tri_angles.reshape(B, lo_num, -1)
|
| 436 |
+
invalid_che_mask = invalid_che_mask.reshape(B, lo_num)
|
| 437 |
|
| 438 |
return triangulated_points, tri_angles, invalid_che_mask
|