Jeremy Watt committed on
Commit
bc066dc
·
1 Parent(s): b9a532a
data/dbs/memes.db DELETED
Binary file (28.7 kB)
 
data/dbs/memes.faiss DELETED
Binary file (393 kB)
 
data/dbs/placeholder DELETED
File without changes
data/input/test_meme_1.jpg DELETED
Binary file (77.1 kB)
 
data/input/test_meme_2.jpg DELETED
Binary file (64.6 kB)
 
data/input/test_meme_3.jpg DELETED
Binary file (8.13 kB)
 
data/input/test_meme_4.jpg DELETED
Binary file (12.7 kB)
 
data/input/test_meme_5.jpg DELETED
Binary file (14.5 kB)
 
data/input/test_meme_6.jpg DELETED
Binary file (65 kB)
 
data/input/test_meme_7.jpg DELETED
Binary file (105 kB)
 
data/input/test_meme_8.jpg DELETED
Binary file (43.9 kB)
 
data/input/test_meme_9.jpg DELETED
Binary file (37.7 kB)
 
meme_search/__init__.py DELETED
@@ -1,10 +0,0 @@
1
- import os
2
-
3
- base_dir = os.path.dirname(os.path.abspath(__file__))
4
- meme_search_root_dir = os.path.dirname(base_dir)
5
-
6
- vector_db_path = meme_search_root_dir + "/data/dbs/memes.faiss"
7
- sqlite_db_path = meme_search_root_dir + "/data/dbs/memes.db"
8
-
9
- from meme_search.data_puller import pull_demo_data
10
- pull_demo_data()
 
 
 
 
 
 
 
 
 
 
 
meme_search/app.py DELETED
@@ -1,34 +0,0 @@
1
- from meme_search import base_dir, sqlite_db_path, vector_db_path
2
- from meme_search.utilities.query import complete_query
3
- import streamlit as st
4
-
5
- st.set_page_config(page_title="Meme Search")
6
-
7
-
8
- # search bar taken from --> https://discuss.streamlit.io/t/creating-a-nicely-formatted-search-field/1804/2
9
- def local_css(file_name):
10
- with open(file_name) as f:
11
- st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
12
-
13
-
14
- def remote_css(url):
15
- st.markdown(f'<link href="{url}" rel="stylesheet">', unsafe_allow_html=True)
16
-
17
-
18
- local_css(base_dir + "/style.css")
19
- remote_css("https://fonts.googleapis.com/icon?family=Material+Icons")
20
-
21
- # icon("search")
22
- buff, col, buff2 = st.columns([1, 4, 1])
23
-
24
- selected = col.text_input(label="search for meme", placeholder="search for a meme")
25
- if selected:
26
- results = complete_query(selected, vector_db_path, sqlite_db_path)
27
- img_paths = [v["img_path"] for v in results]
28
- for result in results:
29
- with col.container(border=True):
30
- st.image(
31
- result["img_path"],
32
- output_format="auto",
33
- caption=f'{result["full_description"]} (query distance = {result["distance"]})',
34
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meme_search/data_puller.py DELETED
@@ -1,114 +0,0 @@
1
- import os
2
- import requests
3
-
4
- def download_file_from_github(repo_url, file_path, save_dir):
5
- raw_url = f"https://raw.githubusercontent.com/{repo_url}/main/{file_path}"
6
- response = requests.get(raw_url)
7
-
8
- if response.status_code == 200:
9
- if not os.path.exists(save_dir):
10
- os.makedirs(save_dir)
11
- save_path = os.path.join(save_dir, os.path.basename(file_path))
12
- with open(save_path, 'wb') as file:
13
- file.write(response.content)
14
-
15
- print(f"File downloaded successfully: {save_path}")
16
- else:
17
- print(f"Failed to download file. Status code: {response.status_code}")
18
-
19
-
20
- def list_files_in_github_directory(owner, repo, directory_path):
21
- url = f"https://api.github.com/repos/{owner}/{repo}/contents/{directory_path}"
22
- response = requests.get(url)
23
-
24
- if response.status_code == 200:
25
- files = response.json()
26
- names = []
27
- for file in files:
28
- names.append(file["name"])
29
- return names
30
- else:
31
- print(f"Failed to retrieve directory contents. Status code: {response.status_code}")
32
-
33
-
34
- def collect_repo_file_names():
35
- owner = "neonwatty"
36
- repo = "meme_search"
37
- input_path = "/data/input"
38
- input_names = list_files_in_github_directory(owner, repo, input_path)
39
-
40
- db_path = "/data/dbs"
41
- db_names = list_files_in_github_directory(owner, repo, db_path)
42
- db_names = [v for v in db_names if ".db" in v or ".faiss" in v]
43
- return input_path, db_path, input_names, db_names
44
-
45
- def check_directory_exists(directory_path):
46
- return os.path.isdir(directory_path)
47
-
48
- def create_directory(directory_path):
49
- try:
50
- os.makedirs(directory_path, exist_ok=True)
51
- print(f"Directory '{directory_path}' created successfully.")
52
- except OSError as error:
53
- print(f"Error creating directory '{directory_path}': {error}")
54
-
55
- def check_files_in_directory(directory_path, file_list):
56
- missing_files = []
57
- for file_name in file_list:
58
- if not os.path.isfile(os.path.join(directory_path, file_name)):
59
- missing_files.append(file_name)
60
- return missing_files
61
-
62
- def list_files_in_directory(directory_path):
63
- try:
64
- files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
65
- return files
66
- except OSError as error:
67
- print(f"Error accessing directory '{directory_path}': {error}")
68
- return []
69
-
70
- def delete_file(directory_path, file_name):
71
- try:
72
- file_path = os.path.join(directory_path, file_name)
73
- if os.path.isfile(file_path):
74
- os.remove(file_path)
75
- print(f"File '{file_name}' deleted successfully.")
76
- else:
77
- print(f"File '{file_name}' does not exist in the directory '{directory_path}'.")
78
- except OSError as error:
79
- print(f"Error deleting file '{file_name}': {error}")
80
-
81
- def pull_demo_data():
82
- repo_url = "neonwatty/meme_search"
83
- input_path, db_path, repo_input_names, repo_db_names = collect_repo_file_names()
84
- if not check_directory_exists("." + input_path):
85
- create_directory("." + input_path)
86
- for name in repo_input_names:
87
- file_path = input_path + "/" + name
88
- download_file_from_github(repo_url, file_path, "." + input_path)
89
- else:
90
- local_input_files = list_files_in_directory("." + input_path)
91
- input_files_to_pull = [item for item in repo_input_names if item not in local_input_files]
92
- input_files_to_delete = [item for item in local_input_files if item not in repo_input_names]
93
-
94
- for name in input_files_to_delete:
95
- delete_file("." + input_path, name)
96
- for name in input_files_to_pull:
97
- file_path = input_path + "/" + name
98
- download_file_from_github(repo_url, file_path, "." + input_path)
99
-
100
- if not check_directory_exists("." + db_path):
101
- create_directory("." + db_path)
102
- repo_url = "neonwatty/meme_search"
103
- for name in repo_db_names:
104
- file_path = db_path + "/" + name
105
- download_file_from_github(repo_url, file_path, "." + db_path)
106
- else:
107
- local_db_files = list_files_in_directory("." + db_path)
108
- db_files_to_pull = [item for item in repo_db_names if item not in local_db_files]
109
- db_files_to_delete = [item for item in local_db_files if item not in repo_db_names]
110
- for name in db_files_to_delete:
111
- delete_file("." + db_path, name)
112
- for name in db_files_to_pull:
113
- file_path = db_path + "/" + name
114
- download_file_from_github(repo_url, file_path, "." + db_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meme_search/style.css DELETED
@@ -1,15 +0,0 @@
1
- body {
2
- color: #fff;
3
- background-color: #4F8BF9;
4
- }
5
-
6
- .stButton>button {
7
- color: #4F8BF9;
8
- border-radius: 50%;
9
- height: 3em;
10
- width: 3em;
11
- }
12
-
13
- .stTextInput>div>div>input {
14
- color: #4F8BF9;
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meme_search/utilities/__init__.py DELETED
@@ -1,10 +0,0 @@
1
- import os
2
- from sentence_transformers import SentenceTransformer
3
-
4
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
5
- utilities_base_dir = os.path.dirname(os.path.abspath(__file__))
6
- meme_search_dir = os.path.dirname(utilities_base_dir)
7
- meme_search_root_dir = os.path.dirname(meme_search_dir)
8
-
9
- vector_db_path = meme_search_root_dir + "/data/dbs/memes.faiss"
10
- sqlite_db_path = meme_search_root_dir + "/data/dbs/memes.db"
 
 
 
 
 
 
 
 
 
 
 
meme_search/utilities/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (538 Bytes)
 
meme_search/utilities/__pycache__/query.cpython-310.pyc DELETED
Binary file (2.66 kB)
 
meme_search/utilities/chunks.py DELETED
@@ -1,68 +0,0 @@
1
- import re
2
-
3
-
4
- def clean_word(text: str) -> str:
5
- # clean input text - keeping only lower case letters, numbers, punctuation, and single quote symbols
6
- return re.sub(" +", " ", re.compile("[^a-z0-9,.!?']").sub(" ", text.lower().strip()))
7
-
8
-
9
- def chunk_text(text: str) -> list:
10
- # split and clean input text
11
- text_split = clean_word(text).split(" ")
12
- text_split = [v for v in text_split if len(v) > 0]
13
-
14
- # use two pointers to create chunks
15
- chunk_size = 4
16
- overlap_size = 2
17
-
18
- # create next chunk by moving right pointer until chunk_size is reached or line_number changes by more than 1 or end of word_sequence is reached
19
- left_pointer = 0
20
- right_pointer = chunk_size - 1
21
- chunks = []
22
-
23
- if right_pointer >= len(text_split):
24
- chunks = [" ".join(text_split)]
25
- else:
26
- while right_pointer < len(text_split):
27
- # check if chunk_size has been reached
28
- # create chunk
29
- chunk = text_split[left_pointer : right_pointer + 1]
30
-
31
- # move left pointer
32
- left_pointer += chunk_size - overlap_size
33
-
34
- # move right pointer
35
- right_pointer += chunk_size - overlap_size
36
-
37
- # store chunk
38
- chunks.append(" ".join(chunk))
39
-
40
- # check if there is final chunk
41
- if len(text_split[left_pointer:]) > 0:
42
- last_chunk = text_split[left_pointer:]
43
- chunks.append(" ".join(last_chunk))
44
-
45
- # insert the full text
46
- if len(chunks) > 1:
47
- chunks.insert(0, text.lower())
48
- return chunks
49
-
50
-
51
- # loop over each meme's moondream based text descriptor and create a short dict containing its full and chunked text
52
- def create_all_img_chunks(img_paths: list, answers: list) -> list:
53
- try:
54
- print("STARTING: create_all_img_chunks")
55
- img_chunks = []
56
- for ind, img_path in enumerate(img_paths):
57
- moondream_meme_text = answers[ind]
58
- moondream_chunks = chunk_text(moondream_meme_text)
59
- for chunk in moondream_chunks:
60
- entry = {}
61
- entry["img_path"] = img_path
62
- entry["chunk"] = chunk
63
- img_chunks.append(entry)
64
- print("SUCCESS: create_all_img_chunks ran successfully")
65
- return img_chunks
66
- except Exception as e:
67
- print(f"FAILURE: create_all_img_chunks failed with exception {e}")
68
- raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meme_search/utilities/create.py DELETED
@@ -1,75 +0,0 @@
1
- import sqlite3
2
- import faiss
3
- from meme_search.utilities import model
4
- from meme_search.utilities.imgs import collect_img_paths
5
- from meme_search.utilities.text_extraction import extract_text_from_imgs
6
- from meme_search.utilities.chunks import create_all_img_chunks
7
- from meme_search.utilities import vector_db_path, sqlite_db_path
8
-
9
-
10
- def create_chunk_db(img_chunks: list, db_filepath: str) -> None:
11
- # Create a lookup table for chunks
12
- conn = sqlite3.connect(db_filepath)
13
- cursor = conn.cursor()
14
-
15
- # Create the table - delete old table if it exists
16
- cursor.execute("DROP TABLE IF EXISTS chunks_reverse_lookup")
17
-
18
- # Create the table - alias rowid as chunk_index
19
- cursor.execute("""
20
- CREATE TABLE IF NOT EXISTS chunks_reverse_lookup (
21
- chunk_index INTEGER PRIMARY KEY,
22
- img_path TEXT,
23
- chunk TEXT
24
- );
25
- """)
26
-
27
- # Insert data into the table
28
- for chunk_index, entry in enumerate(img_chunks):
29
- img_path = entry["img_path"]
30
- chunk = entry["chunk"]
31
- cursor.execute(
32
- "INSERT INTO chunks_reverse_lookup (chunk_index, img_path, chunk) VALUES (?, ?, ?)",
33
- (chunk_index, img_path, chunk),
34
- )
35
-
36
- conn.commit()
37
- conn.close()
38
-
39
-
40
- def create_vector_db(chunks: list, db_file_path: str) -> None:
41
- # embed inputs
42
- embeddings = model.encode(chunks)
43
-
44
- # dump all_embeddings to faiss index
45
- index = faiss.IndexFlatL2(embeddings.shape[1])
46
- index.add(embeddings)
47
-
48
- # write index to disk
49
- faiss.write_index(index, db_file_path)
50
-
51
-
52
- def complete_create_dbs(img_chunks: list, vector_db_path: str, sqlite_db_path: str) -> None:
53
- try:
54
- print("STARTING: complete_create_dbs")
55
-
56
- # create db for img_chunks
57
- create_chunk_db(img_chunks, sqlite_db_path)
58
-
59
- # create vector embedding db for chunks
60
- chunks = [v["chunk"] for v in img_chunks]
61
- create_vector_db(chunks, vector_db_path)
62
- print("SUCCESS: complete_create_dbs succeeded")
63
- except Exception as e:
64
- print(f"FAILURE: complete_create_dbs failed with exception {e}")
65
-
66
-
67
- def process():
68
- all_img_paths = collect_img_paths()
69
- moondream_answers = extract_text_from_imgs(all_img_paths)
70
- img_chunks = create_all_img_chunks(all_img_paths, moondream_answers)
71
- complete_create_dbs(img_chunks, vector_db_path, sqlite_db_path)
72
-
73
-
74
- if __name__ == "__main__":
75
- process()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meme_search/utilities/imgs.py DELETED
@@ -1,19 +0,0 @@
1
- import os
2
- from meme_search.utilities import meme_search_root_dir
3
-
4
- allowable_extensions = ["jpg", "jpeg", "png"]
5
-
6
-
7
- def collect_img_paths() -> list:
8
- try:
9
- img_dir = meme_search_root_dir + "/data/input"
10
- print("STARTING: collect_img_paths")
11
-
12
- all_img_paths = [os.path.join(img_dir, name) for name in os.listdir(img_dir) if name.split(".")[-1] in allowable_extensions]
13
- all_img_paths = sorted(all_img_paths)
14
-
15
- print(f"SUCCESS: collect_img_paths ran successfully - image paths loaded from '{img_dir}'")
16
- return all_img_paths
17
- except Exception as e:
18
- print(f"FAILURE: collect_img_paths failed with img_dir {img_dir} with exception {e}")
19
- raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meme_search/utilities/query.py DELETED
@@ -1,76 +0,0 @@
1
- import faiss
2
- import sqlite3
3
- import numpy as np
4
- from typing import Tuple
5
- import argparse
6
- from meme_search.utilities import model
7
- from meme_search.utilities import vector_db_path, sqlite_db_path
8
-
9
-
10
- def query_vector_db(query: str, db_file_path: str, k: int = 10) -> Tuple[list, list]:
11
- # connect to db
12
- faiss_index = faiss.read_index(db_file_path)
13
-
14
- # test
15
- encoded_query = np.expand_dims(model.encode(query), axis=0)
16
-
17
- # query db
18
- distances, indices = faiss_index.search(encoded_query, k)
19
- distances = distances.tolist()[0]
20
- indices = indices.tolist()[0]
21
- return distances, indices
22
-
23
-
24
- def query_sqlite_db(indices: list, db_filepath: str) -> list:
25
- conn = sqlite3.connect(db_filepath)
26
- cursor = conn.cursor()
27
- query = f"SELECT * FROM chunks_reverse_lookup WHERE chunk_index IN {tuple(indices)}"
28
- cursor.execute(query)
29
- rows = cursor.fetchall()
30
- rows = [{"index": row[0], "img_path": row[1], "chunk": row[2]} for row in rows]
31
- rows = sorted(rows, key=lambda x: indices.index(x["index"])) # re-sort rows according to input indices
32
- for row in rows:
33
- query = f"SELECT * FROM chunks_reverse_lookup WHERE chunk_index=(SELECT MIN(chunk_index) FROM chunks_reverse_lookup WHERE img_path='{row['img_path']}')"
34
- cursor.execute(query)
35
- full_description_row = cursor.fetchall()
36
- row["full_description"] = full_description_row[0][2]
37
- conn.close()
38
- return rows
39
-
40
-
41
- def complete_query(query: str, vector_db_path: str, sqlite_db_path: str, k: int = 10) -> list:
42
- try:
43
- print("STARTING: complete_query")
44
-
45
- # query vector_db, first converting input query to embedding
46
- distances, indices = query_vector_db(query, vector_db_path, k=k)
47
-
48
- # use indices to query sqlite db containing chunk data
49
- img_chunks = query_sqlite_db(indices, sqlite_db_path) # bump up indices by 1 since sqlite row index starts at 1 not 0
50
-
51
- # map indices back to correct image in img_chunks
52
- imgs_seen = []
53
- unique_img_entries = []
54
- for ind, entry in enumerate(img_chunks):
55
- if entry["img_path"] in imgs_seen:
56
- continue
57
- else:
58
- entry["distance"] = round(distances[ind], 2)
59
- unique_img_entries.append(entry)
60
- imgs_seen.append(entry["img_path"])
61
- print("SUCCESS: complete_query succeeded")
62
- return unique_img_entries
63
- except Exception as e:
64
- print(f"FAILURE: complete_query failed with exception {e}")
65
- raise e
66
-
67
-
68
- if __name__ == "__main__":
69
- parser = argparse.ArgumentParser()
70
- parser.add_argument("--query", dest="query", type=str, help="Add query")
71
- args = parser.parse_args()
72
- query = args.query
73
-
74
- print(query)
75
- results = complete_query(query, vector_db_path, sqlite_db_path)
76
- print(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meme_search/utilities/text_extraction.py DELETED
@@ -1,38 +0,0 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer
2
- from PIL import Image
3
- import transformers
4
-
5
- transformers.logging.set_verbosity_error()
6
-
7
-
8
- def prompt_moondream(img_path: str, prompt: str) -> str:
9
- # copied from moondream demo readme --> https://github.com/vikhyat/moondream/tree/main
10
- model_id = "vikhyatk/moondream2"
11
- revision = "2024-05-20"
12
- model = AutoModelForCausalLM.from_pretrained(
13
- model_id,
14
- trust_remote_code=True,
15
- revision=revision,
16
- )
17
- tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
18
- image = Image.open(img_path)
19
- enc_image = model.encode_image(image)
20
- moondream_response = model.answer_question(enc_image, prompt, tokenizer)
21
- return moondream_response
22
-
23
-
24
- def extract_text_from_imgs(img_paths: list) -> list:
25
- try:
26
- print("STARTING: extract_text_from_imgs")
27
- prompt = "Describe this image."
28
- answers = []
29
- for img_path in img_paths:
30
- print(f"INFO: prompting moondream for a description of image: '{img_path}'")
31
- answer = prompt_moondream(img_path, prompt)
32
- answers.append(answer)
33
- print("DONE!")
34
- print("SUCCESS: extract_text_from_imgs succeeded")
35
- return answers
36
- except Exception as e:
37
- print(f"FAILURE: extract_text_from_imgs failed with exception {e}")
38
- raise e