Spaces:
Paused
Paused
Commit
·
b19c7bf
1
Parent(s):
27a6ae5
solve
Browse files
app.py
CHANGED
|
@@ -42,7 +42,7 @@ def vggsfm_demo(
|
|
| 42 |
|
| 43 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 44 |
|
| 45 |
-
max_input_image =
|
| 46 |
|
| 47 |
target_dir = f"input_images_{timestamp}"
|
| 48 |
if os.path.exists(target_dir):
|
|
@@ -203,7 +203,7 @@ with gr.Blocks() as demo:
|
|
| 203 |
<li>upload the images (.jpg, .png, etc.), or </li>
|
| 204 |
<li>upload a video (.mp4, .mov, etc.) </li>
|
| 205 |
</ul>
|
| 206 |
-
<p>If both images and videos are uploaded, the demo will only reconstruct the uploaded images. By default, we extract <strong> 1 image frame per second from the input video </strong>. To prevent crashes on the Hugging Face space, we currently limit reconstruction to the first
|
| 207 |
<p>SfM methods are designed for <strong> rigid/static reconstruction </strong>. When dealing with dynamic/moving inputs, these methods may still work by focusing on the rigid parts of the scene. However, to ensure high-quality results, it is better to minimize the presence of moving objects in the input data. </p>
|
| 208 |
<p>The reconstruction should typically take <strong> up to 90 seconds </strong>. If it takes longer, the input data is likely not well-conditioned. </p>
|
| 209 |
<p>If you encounter any problems, feel free to create an issue in our <a href="https://github.com/facebookresearch/vggsfm" target="_blank">GitHub Repo</a> ⭐</p>
|
|
@@ -245,6 +245,7 @@ with gr.Blocks() as demo:
|
|
| 245 |
cache_examples=True,
|
| 246 |
)
|
| 247 |
|
|
|
|
| 248 |
submit_btn.click(
|
| 249 |
vggsfm_demo,
|
| 250 |
[input_video, input_images, num_query_images, num_query_points],
|
|
|
|
| 42 |
|
| 43 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 44 |
|
| 45 |
+
max_input_image = 25
|
| 46 |
|
| 47 |
target_dir = f"input_images_{timestamp}"
|
| 48 |
if os.path.exists(target_dir):
|
|
|
|
| 203 |
<li>upload the images (.jpg, .png, etc.), or </li>
|
| 204 |
<li>upload a video (.mp4, .mov, etc.) </li>
|
| 205 |
</ul>
|
| 206 |
+
<p>If both images and videos are uploaded, the demo will only reconstruct the uploaded images. By default, we extract <strong> 1 image frame per second from the input video </strong>. To prevent crashes on the Hugging Face space, we currently limit reconstruction to the first 25 image frames. </p>
|
| 207 |
<p>SfM methods are designed for <strong> rigid/static reconstruction </strong>. When dealing with dynamic/moving inputs, these methods may still work by focusing on the rigid parts of the scene. However, to ensure high-quality results, it is better to minimize the presence of moving objects in the input data. </p>
|
| 208 |
<p>The reconstruction should typically take <strong> up to 90 seconds </strong>. If it takes longer, the input data is likely not well-conditioned. </p>
|
| 209 |
<p>If you encounter any problems, feel free to create an issue in our <a href="https://github.com/facebookresearch/vggsfm" target="_blank">GitHub Repo</a> ⭐</p>
|
|
|
|
| 245 |
cache_examples=True,
|
| 246 |
)
|
| 247 |
|
| 248 |
+
|
| 249 |
submit_btn.click(
|
| 250 |
vggsfm_demo,
|
| 251 |
[input_video, input_images, num_query_images, num_query_points],
|
vggsfm_code/vggsfm/models/triangulator.py
CHANGED
|
@@ -323,7 +323,7 @@ class Triangulator(nn.Module):
|
|
| 323 |
# We adopt LORANSAC here again
|
| 324 |
|
| 325 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 326 |
-
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 327 |
)
|
| 328 |
|
| 329 |
# Determine valid tracks based on inlier numbers
|
|
|
|
| 323 |
# We adopt LORANSAC here again
|
| 324 |
|
| 325 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 326 |
+
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 327 |
)
|
| 328 |
|
| 329 |
# Determine valid tracks based on inlier numbers
|
vggsfm_code/vggsfm/utils/triangulation.py
CHANGED
|
@@ -755,7 +755,7 @@ def iterative_global_BA(
|
|
| 755 |
|
| 756 |
# triangulate tracks by LORANSAC
|
| 757 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 758 |
-
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 759 |
)
|
| 760 |
|
| 761 |
best_triangulated_points[valid_tracks] = points3D_opt
|
|
|
|
| 755 |
|
| 756 |
# triangulate tracks by LORANSAC
|
| 757 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
| 758 |
+
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
| 759 |
)
|
| 760 |
|
| 761 |
best_triangulated_points[valid_tracks] = points3D_opt
|
vggsfm_code/vggsfm/utils/triangulation_helpers.py
CHANGED
|
@@ -384,7 +384,7 @@ def generate_combinations(N):
|
|
| 384 |
return comb_array
|
| 385 |
|
| 386 |
|
| 387 |
-
def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_num=50):
|
| 388 |
"""
|
| 389 |
Local Refinement for triangulation
|
| 390 |
"""
|
|
@@ -392,7 +392,6 @@ def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_nu
|
|
| 392 |
batch_index = torch.arange(B).unsqueeze(-1).expand(-1, lo_num)
|
| 393 |
|
| 394 |
points1_expand = points1.unsqueeze(1).expand(-1, lo_num, -1, -1)
|
| 395 |
-
extrinsics_expand = extrinsics.unsqueeze(1).expand(-1, lo_num, -1, -1, -1)
|
| 396 |
|
| 397 |
# The sets selected for local refinement
|
| 398 |
lo_indices = sorted_indices[:, :lo_num]
|
|
@@ -402,18 +401,38 @@ def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_nu
|
|
| 402 |
lo_points1 = torch.zeros_like(points1_expand)
|
| 403 |
lo_points1[lo_mask] = points1_expand[lo_mask]
|
| 404 |
|
| 405 |
-
lo_points1 = lo_points1.reshape(B * lo_num, N, -1)
|
| 406 |
-
lo_mask = lo_mask.reshape(B * lo_num, N)
|
| 407 |
-
lo_extrinsics = extrinsics_expand.reshape(B * lo_num, N, 3, 4)
|
| 408 |
-
|
| 409 |
-
# triangulate the inliers
|
| 410 |
-
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
| 411 |
-
lo_extrinsics, lo_points1, mask=lo_mask, compute_tri_angle=True, check_cheirality=True
|
| 412 |
-
)
|
| 413 |
|
| 414 |
-
triangulated_points = triangulated_points.reshape(B, lo_num, 3)
|
| 415 |
-
tri_angles = tri_angles.reshape(B, lo_num, -1)
|
| 416 |
|
| 417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
|
| 419 |
return triangulated_points, tri_angles, invalid_che_mask
|
|
|
|
| 384 |
return comb_array
|
| 385 |
|
| 386 |
|
| 387 |
+
def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_num=50, low_mem=True):
|
| 388 |
"""
|
| 389 |
Local Refinement for triangulation
|
| 390 |
"""
|
|
|
|
| 392 |
batch_index = torch.arange(B).unsqueeze(-1).expand(-1, lo_num)
|
| 393 |
|
| 394 |
points1_expand = points1.unsqueeze(1).expand(-1, lo_num, -1, -1)
|
|
|
|
| 395 |
|
| 396 |
# The sets selected for local refinement
|
| 397 |
lo_indices = sorted_indices[:, :lo_num]
|
|
|
|
| 401 |
lo_points1 = torch.zeros_like(points1_expand)
|
| 402 |
lo_points1[lo_mask] = points1_expand[lo_mask]
|
| 403 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
|
|
|
|
|
|
| 405 |
|
| 406 |
+
if low_mem:
|
| 407 |
+
all_triangulated_points = []
|
| 408 |
+
all_tri_angles = []
|
| 409 |
+
all_invalid_che_mask = []
|
| 410 |
+
|
| 411 |
+
for loidx in range(lo_num):
|
| 412 |
+
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
| 413 |
+
extrinsics, lo_points1[:, loidx], mask=lo_mask[:, loidx], compute_tri_angle=True, check_cheirality=True
|
| 414 |
+
)
|
| 415 |
+
# Append the outputs to the respective lists
|
| 416 |
+
all_triangulated_points.append(triangulated_points[:, None])
|
| 417 |
+
all_tri_angles.append(tri_angles[:, None])
|
| 418 |
+
all_invalid_che_mask.append(invalid_che_mask[:,None])
|
| 419 |
+
|
| 420 |
+
triangulated_points = torch.cat(all_triangulated_points, dim=1)
|
| 421 |
+
tri_angles = torch.cat(all_tri_angles, dim=1)
|
| 422 |
+
invalid_che_mask = torch.cat(all_invalid_che_mask, dim=1)
|
| 423 |
+
else:
|
| 424 |
+
extrinsics_expand = extrinsics.unsqueeze(1).expand(-1, lo_num, -1, -1, -1)
|
| 425 |
+
lo_points1 = lo_points1.reshape(B * lo_num, N, -1)
|
| 426 |
+
lo_mask = lo_mask.reshape(B * lo_num, N)
|
| 427 |
+
lo_extrinsics = extrinsics_expand.reshape(B * lo_num, N, 3, 4)
|
| 428 |
+
|
| 429 |
+
# triangulate the inliers
|
| 430 |
+
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
| 431 |
+
lo_extrinsics, lo_points1, mask=lo_mask, compute_tri_angle=True, check_cheirality=True
|
| 432 |
+
)
|
| 433 |
+
|
| 434 |
+
triangulated_points = triangulated_points.reshape(B, lo_num, 3)
|
| 435 |
+
tri_angles = tri_angles.reshape(B, lo_num, -1)
|
| 436 |
+
invalid_che_mask = invalid_che_mask.reshape(B, lo_num)
|
| 437 |
|
| 438 |
return triangulated_points, tri_angles, invalid_che_mask
|