Spaces:
Build error
Build error
Updated load hugging face dataset function
Browse files
helper.py
CHANGED
|
@@ -47,20 +47,23 @@ def encode_query(query: Union[str, Image.Image]) -> torch.Tensor:
|
|
| 47 |
|
| 48 |
def load_hf_datasets(dataset_name):
|
| 49 |
"""
|
| 50 |
-
Load
|
| 51 |
-
|
| 52 |
dataset_name: str - name of dataset on Hugging Face
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
RETURNS: dataset as pandas dataframe
|
| 56 |
"""
|
| 57 |
dataset = load_dataset(f"quasara-io/{dataset_name}")
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
def get_image_vectors(df):
|
| 65 |
# Get the image vectors from the dataframe
|
| 66 |
image_vectors = np.vstack(df['Vector'].to_numpy())
|
|
|
|
| 47 |
|
| 48 |
def load_hf_datasets(dataset_name):
    """
    Load all splits containing 'Main' from a Hugging Face dataset as a DataFrame
    ---------------------------------------------------------------------------
    dataset_name: str - name of dataset on Hugging Face (looked up as
                        "quasara-io/<dataset_name>")
    ---------------------------------------------------------------------------
    RETURNS: concatenated dataset as a pandas DataFrame
    RAISES: ValueError if no split name contains 'Main'
    """
    dataset = load_dataset(f"quasara-io/{dataset_name}")

    # Filter splits that contain the word 'Main' (case-sensitive substring match)
    main_splits = [split for split in dataset if 'Main' in split]

    # pd.concat rejects an empty list with an opaque "No objects to concatenate";
    # fail here instead, with a message that names the dataset and its splits.
    if not main_splits:
        raise ValueError(
            f"Dataset 'quasara-io/{dataset_name}' has no split containing 'Main'; "
            f"available splits: {list(dataset)}"
        )

    # Load and concatenate all splits containing 'Main' into a single DataFrame,
    # discarding the per-split indices in favour of one fresh 0..n-1 index
    df_list = [dataset[split].to_pandas() for split in main_splits]
    combined_df = pd.concat(df_list, ignore_index=True)

    return combined_df
|
| 66 |
+
|
| 67 |
def get_image_vectors(df):
|
| 68 |
# Get the image vectors from the dataframe
|
| 69 |
image_vectors = np.vstack(df['Vector'].to_numpy())
|