Added small object search
helper.py CHANGED
@@ -8,8 +8,11 @@ import torch.nn as nn
 import boto3
 import streamlit as st
 from PIL import Image
+from PIL import ImageDraw
 from io import BytesIO
+import pandas as pd
 from typing import List, Union
+import concurrent.futures
 
 
 # Initialize the model globally to avoid reloading each time
@@ -21,12 +24,10 @@ tokenizer = get_tokenizer('hf-hub:timm/ViT-SO400M-14-SigLIP-384')
 def encode_query(query: Union[str, Image.Image]) -> torch.Tensor:
     """
     Encode the query using the OpenCLIP model.
-
     Parameters
     ----------
     query : Union[str, Image.Image]
         The query, which can be a text string or an Image object.
-
     Returns
     -------
     torch.Tensor
@@ -45,21 +46,49 @@ def encode_query(query: Union[str, Image.Image]) -> torch.Tensor:
 
     return query_embedding
 
-def load_hf_datasets(
+def load_hf_datasets(key,dataset):
     """
     Load Datasets from Hugging Face as DF
     ---------------------------------------
     dataset_name: str - name of dataset on Hugging Face
     ---------------------------------------
-
     RETURNS: dataset as pandas dataframe
     """
-
-
-    main_dataset = dataset['Main_1']
-    # Convert to Pandas DataFrame
-    df = main_dataset.to_pandas()
+    df = dataset[key].to_pandas()
+
     return df
+
+def parallel_load_and_combine(dataset_keys, dataset):
+    """
+    Load datasets in parallel and combine Main and Split keys
+    ----------------------------------------------------------
+    dataset_keys: list - keys of the dataset (e.g., ['Main_1', 'Split_1', ...])
+    dataset: DatasetDict - the loaded Hugging Face dataset
+    ----------------------------------------------------------
+    RETURNS: combined DataFrame from both Main and Split keys
+    """
+    # Separate keys into Main and Split lists
+    main_keys = [key for key in dataset_keys if key.startswith('Main')]
+    split_keys = [key for key in dataset_keys if key.startswith('Split')]
+
+    def process_key(key, key_type):
+        df = load_hf_datasets(key, dataset)
+        return df
+
+    # Parallel loading of Main keys
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        main_dfs = list(executor.map(lambda key: process_key(key, 'Main'), main_keys))
+
+    # Parallel loading of Split keys
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        split_dfs = list(executor.map(lambda key: process_key(key, 'Split'), split_keys))
+
+    # Combine Main DataFrames and Split DataFrames
+    main_combined_df = pd.concat(main_dfs, ignore_index=True) if main_dfs else pd.DataFrame()
+    split_combined_df = pd.concat(split_dfs, ignore_index=True) if split_dfs else pd.DataFrame()
+
+
+    return main_combined_df, split_combined_df
 
 def get_image_vectors(df):
     # Get the image vectors from the dataframe
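A quick sanity check of the two new loaders (a minimal sketch, not part of the commit; it assumes the quasara-io/WayveScenes dataset that main() loads later in this diff, whose split names begin with 'Main_' and 'Split_'):

    from datasets import load_dataset

    dataset = load_dataset("quasara-io/WayveScenes")
    main_df, split_df = parallel_load_and_combine(dataset.keys(), dataset)
    print(main_df.shape, split_df.shape)  # 'Main_*' frame rows vs. 'Split_*' small-object rows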
@@ -67,7 +96,7 @@ def get_image_vectors(df):
     return torch.tensor(image_vectors, dtype=torch.float32)
 
 
-def search(query, df, limit, offset, scoring_func, search_in_images, search_in_small_objects):
+def search(query, df, limit, offset, scoring_func, search_in_images):
     if search_in_images:
         # Encode the image query
         query_vector = encode_query(query)
@@ -79,7 +108,7 @@ def search(query, df, limit, offset, scoring_func, search_in_images, search_in_s
 
     # Calculate the cosine similarity between the query vector and each image vector
     query_vector = query_vector[0, :].detach().numpy() # Detach and convert to a NumPy array
-    image_vectors =
+    image_vectors = image_vectors.detach().numpy() # Convert the image vectors to a NumPy array
     cosine_similarities = cosine_similarity([query_vector], image_vectors)
 
     # Get the top K indices of the most similar image vectors
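For context on the conversion above: cosine_similarity scores each image vector v against the query q as q.v / (|q||v|), which is why both sides must be plain NumPy arrays. A tiny self-contained sketch of the call shape (assuming cosine_similarity is scikit-learn's; its import sits outside this hunk):

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    q = np.array([1.0, 0.0])                            # stands in for query_vector
    V = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]])  # stands in for image_vectors
    print(cosine_similarity([q], V))                    # approximately [[1.0, 0.0, 0.7071]]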
@@ -88,6 +117,29 @@ def search(query, df, limit, offset, scoring_func, search_in_images, search_in_s
     # Return the top K indices
     return top_k_indices
 
+#Try Batch Search
+def batch_search(query, df, batch_size=100000, limit=10):
+    top_k_indices = []
+
+    # Get the image vectors from the dataframe and ensure they are NumPy arrays
+    vectors = get_image_vectors(df).numpy() # Convert to NumPy array if it's a tensor
+
+    # Encode the query and ensure it's a NumPy array
+    query_vector = encode_query(query)[0].detach().numpy() # Assuming the first element is the query embedding
+
+    # Iterate over the batches and compute cosine similarities
+    for i in range(0, len(vectors), batch_size):
+        batch_vectors = vectors[i:i + batch_size] # Extract a batch of vectors
+
+        # Compute cosine similarity between the query vector and the batch
+        batch_similarities = cosine_similarity([query_vector], batch_vectors)
+
+        # Get the top-k similar vectors within this batch
+        top_k_indices.extend(np.argsort(-batch_similarities[0])[:limit] + i) # offset batch-local indices to global rows
+
+    return top_k_indices
+
+
 def get_file_paths(df, top_k_indices, column_name = 'File_Path'):
     """
     Retrieve the file paths (or any specific column) from the DataFrame using the top K indices.
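One caveat on batch_search: even with the global-row offset, it keeps up to limit candidates from every batch, so the returned list can be longer than limit and is not globally ranked. A follow-up sketch (a hypothetical helper, not in the commit) that reduces those candidates to a true global top K:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    def rerank_global_top_k(query_vector, vectors, candidate_indices, limit=10):
        # Re-score only the per-batch candidates, then keep the best `limit` overall
        candidates = np.unique(np.asarray(candidate_indices))
        scores = cosine_similarity([query_vector], vectors[candidates])[0]
        return candidates[np.argsort(-scores)[:limit]].tolist()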
@@ -97,6 +149,21 @@ def get_file_paths(df, top_k_indices, column_name = 'File_Path'):
     - top_k_indices: numpy array of the top K indices
     - column_name: str, the name of the column to fetch (e.g., 'ImagePath')
 
+    Returns:
+    - top_k_paths: list of file paths or values from the specified column
+    """
+    # Fetch the specific column corresponding to the top K indices
+    top_k_paths = df.iloc[top_k_indices][column_name].tolist()
+    return top_k_paths
+
+def get_cordinates(df, top_k_indices, column_name = 'Coordinate'):
+    """
+    Retrieve the coordinates (or any specific column) from the DataFrame using the top K indices.
+
+    Parameters:
+    - df: pandas DataFrame containing the data
+    - top_k_indices: numpy array of the top K indices
+    - column_name: str, the name of the column to fetch (e.g., 'Coordinate')
+
     Returns:
     - top_k_paths: list of file paths or values from the specified column
     """
@@ -104,8 +171,7 @@ def get_file_paths(df, top_k_indices, column_name = 'File_Path'):
     top_k_paths = df.iloc[top_k_indices][column_name].tolist()
     return top_k_paths
 
-
-def get_images_from_s3_to_display(bucket_name, file_paths, AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY, folder_name= None):
+def get_images_from_s3_to_display(bucket_name, file_paths, AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY, folder_name):
     """
     Retrieve and display images from AWS S3 in a Streamlit app.
 
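Note that the commit drops the folder_name= None default, so existing callers must now pass the S3 prefix explicitly. A hypothetical call (bucket, credentials, and prefix are placeholders):

    get_images_from_s3_to_display("my-bucket", top_k_paths,
                                  AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                                  folder_name="images/")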
@@ -135,21 +201,88 @@ def get_images_from_s3_to_display(bucket_name, file_paths, AWS_ACCESS_KEY_ID,AWS
 
 
 
+def get_images_with_bounding_boxes_from_s3(bucket_name, file_paths, bounding_boxes, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, folder_name):
+    """
+    Retrieve and display images from AWS S3 with corresponding bounding boxes in a Streamlit app.
+
+    Parameters:
+    - bucket_name: str, the name of the S3 bucket
+    - file_paths: list, a list of file paths to retrieve from S3
+    - bounding_boxes: list of numpy arrays or lists, each containing coordinates of bounding boxes (in the form [x_min, y_min, x_max, y_max])
+    - AWS_ACCESS_KEY_ID: str, AWS access key ID for authentication
+    - AWS_SECRET_ACCESS_KEY: str, AWS secret access key for authentication
+    - folder_name: str, the folder prefix in S3 bucket where the images are stored
+
+    Returns:
+    - None (directly displays images in the Streamlit app with bounding boxes)
+    """
+    # Initialize S3 client
+    s3 = boto3.client(
+        's3',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+
+    # Iterate over file paths and corresponding bounding boxes
+    for file_path, box_coords in zip(file_paths, bounding_boxes):
+        # Retrieve the image from S3
+        s3_object = s3.get_object(Bucket=bucket_name, Key=f"{folder_name}{file_path}")
+        img_data = s3_object['Body'].read()
+
+        # Open the image using PIL
+        img = Image.open(BytesIO(img_data))
+
+        # Draw bounding boxes on the image
+        draw = ImageDraw.Draw(img)
+
+        # Ensure box_coords is iterable, in case it's a single numpy array or float value
+        if isinstance(box_coords, (np.ndarray, list)):
+            # Check if we have multiple bounding boxes or a single one
+            if len(box_coords) > 0 and isinstance(box_coords[0], (np.ndarray, list)):
+                # Multiple bounding boxes
+                for box in box_coords:
+                    x_min, y_min, x_max, y_max = map(int, box) # Convert to integers
+                    draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=3)
+            else:
+                # Single bounding box
+                x_min, y_min, x_max, y_max = map(int, box_coords)
+                draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=3)
+        else:
+            raise ValueError(f"Bounding box data for {file_path} is not in an iterable format.")
+
+        # Display the image with bounding boxes using Streamlit
+        st.image(img, caption=file_path, use_column_width=True)
+
+
 def main():
-
+    print('Begin Main')
+    dataset_name = "WayveScenes"
     query = "black car"
     limit = 10
     offset = 0
     scoring_func = "cosine"
     search_in_images = True
-    search_in_small_objects =
-
-
-
-
-
+    search_in_small_objects = True
+    dataset = load_dataset(f"quasara-io/{dataset_name}")
+    print('loaded dataset')
+    dataset_keys = dataset.keys()
+    main_df, split_df = parallel_load_and_combine(dataset_keys, dataset)
+    # Get the file paths and coordinates for the top results
+    print('processed datasets')
+    if search_in_small_objects:
+        results = batch_search(query, split_df)
+        print(results)
+        top_k_paths = get_file_paths(split_df,results)
+        top_k_cordinates = get_cordinates(split_df, results)
+        print(top_k_paths)
+        print(top_k_cordinates)
+        return top_k_paths, top_k_cordinates
+    else:
+        results = search(query, main_df, limit, offset, scoring_func, search_in_images)
+        top_k_paths = get_file_paths(main_df,results)
+        print(top_k_paths)
+        return top_k_paths
 
 
 if __name__ == "__main__":
     main()
-
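Taken together, the new small-object path composes as below (a sketch under the same assumptions as the notes above: placeholder bucket and credentials, 'Main_*'/'Split_*' split naming, scikit-learn cosine_similarity):

    from datasets import load_dataset

    dataset = load_dataset("quasara-io/WayveScenes")
    main_df, split_df = parallel_load_and_combine(dataset.keys(), dataset)

    results = batch_search("black car", split_df, limit=10)  # rank small-object rows
    paths = get_file_paths(split_df, results)                # S3 keys of the source frames
    boxes = get_cordinates(split_df, results)                # matching bounding boxes

    # Draw the boxes and render in Streamlit (placeholder bucket and credentials)
    get_images_with_bounding_boxes_from_s3("my-bucket", paths, boxes,
                                           "AKIA...", "secret...", folder_name="images/")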