Henry65
/

RepoSim4Py

@@ -126,8 +126,8 @@ def extract_information(repos, headers=None):
                         )
                     except SyntaxError as e:
                         tqdm.write(f"[-] SyntaxError in {member.name}, skipping: \n{e}")
                 elif (member.name.endswith("README.md") or member.name.endswith("README.rst")) and member.isfile():
-                    # 3. Extracting readme.
                     try:
                         file_content = tar.extractfile(member).read().decode("utf-8")
                         # extract readme
@@ -140,8 +140,8 @@ def extract_information(repos, headers=None):
                         )
                     except SyntaxError as e:
                         tqdm.write(f"[-] SyntaxError in {member.name}, skipping: \n{e}")
                 elif member.name.endswith("requirements.txt") and member.isfile():
-                    # 4. Extracting requirements.
                     try:
                         lines = tar.extractfile(member).readlines().decode("utf-8")
                         # extract requirements
@@ -290,25 +290,26 @@ class RepoPipeline(Pipeline):
                 tqdm.write(f"[*] Generating code embeddings for {repo_name}")
                 code_embeddings = self.generate_embeddings(repo_info["codes"], max_length)
                 info["code_embeddings"] = code_embeddings.cpu().numpy()
-                info["mean_code_embedding"] = torch.mean(code_embeddings, dim=0).cpu().numpy()
                 # Doc embeddings
                 tqdm.write(f"[*] Generating doc embeddings for {repo_name}")
                 doc_embeddings = self.generate_embeddings(repo_info["docs"], max_length)
                 info["doc_embeddings"] = doc_embeddings.cpu().numpy()
-                info["mean_doc_embedding"] = torch.mean(doc_embeddings, dim=0).cpu().numpy()
                 # Requirement embeddings
                 tqdm.write(f"[*] Generating requirement embeddings for {repo_name}")
                 requirement_embeddings = self.generate_embeddings(repo_info["requirements"], max_length)
                 info["requirement_embeddings"] = requirement_embeddings.cpu().numpy()
-                info["mean_requirement_embedding"] = torch.mean(requirement_embeddings, dim=0).cpu().numpy()
                 # Readme embeddings
                 tqdm.write(f"[*] Generating readme embeddings for {repo_name}")
                 readme_embeddings = self.generate_embeddings(repo_info["readmes"], max_length)
                 info["readme_embeddings"] = readme_embeddings.cpu().numpy()
-                info["mean_readme_embedding"] = torch.mean(readme_embeddings, dim=0).cpu().numpy()
                 # Repo-level mean embedding
                 info["mean_repo_embedding"] = np.concatenate([
@@ -316,13 +317,16 @@ class RepoPipeline(Pipeline):
                     info["mean_doc_embedding"],
                     info["mean_requirement_embedding"],
                     info["mean_readme_embedding"]
-                ], axis=0)
-                # TODO Remove test
                 info["code_embeddings_shape"] = info["code_embeddings"].shape
                 info["doc_embeddings_shape"] = info["doc_embeddings"].shape
                 info["requirement_embeddings_shape"] = info["requirement_embeddings"].shape
                 info["readme_embeddings_shape"] = info["readme_embeddings"].shape
                 info["mean_repo_embedding_shape"] = info["mean_repo_embedding"].shape
                 progress_bar.update(1)

                         )
                     except SyntaxError as e:
                         tqdm.write(f"[-] SyntaxError in {member.name}, skipping: \n{e}")
+                # 3. Extracting readme.
                 elif (member.name.endswith("README.md") or member.name.endswith("README.rst")) and member.isfile():
                     try:
                         file_content = tar.extractfile(member).read().decode("utf-8")
                         # extract readme
                         )
                     except SyntaxError as e:
                         tqdm.write(f"[-] SyntaxError in {member.name}, skipping: \n{e}")
+                # 4. Extracting requirements.
                 elif member.name.endswith("requirements.txt") and member.isfile():
                     try:
                         lines = tar.extractfile(member).readlines().decode("utf-8")
                         # extract requirements
                 tqdm.write(f"[*] Generating code embeddings for {repo_name}")
                 code_embeddings = self.generate_embeddings(repo_info["codes"], max_length)
                 info["code_embeddings"] = code_embeddings.cpu().numpy()
+                info["mean_code_embedding"] = torch.mean(code_embeddings, dim=0, keepdim=True).cpu().numpy()
                 # Doc embeddings
                 tqdm.write(f"[*] Generating doc embeddings for {repo_name}")
                 doc_embeddings = self.generate_embeddings(repo_info["docs"], max_length)
                 info["doc_embeddings"] = doc_embeddings.cpu().numpy()
+                info["mean_doc_embedding"] = torch.mean(doc_embeddings, dim=0, keepdim=True).cpu().numpy()
                 # Requirement embeddings
                 tqdm.write(f"[*] Generating requirement embeddings for {repo_name}")
                 requirement_embeddings = self.generate_embeddings(repo_info["requirements"], max_length)
                 info["requirement_embeddings"] = requirement_embeddings.cpu().numpy()
+                info["mean_requirement_embedding"] = torch.mean(requirement_embeddings, dim=0,
+                                                                keepdim=True).cpu().numpy()
                 # Readme embeddings
                 tqdm.write(f"[*] Generating readme embeddings for {repo_name}")
                 readme_embeddings = self.generate_embeddings(repo_info["readmes"], max_length)
                 info["readme_embeddings"] = readme_embeddings.cpu().numpy()
+                info["mean_readme_embedding"] = torch.mean(readme_embeddings, dim=0, keepdim=True).cpu().numpy()
                 # Repo-level mean embedding
                 info["mean_repo_embedding"] = np.concatenate([
                     info["mean_doc_embedding"],
                     info["mean_requirement_embedding"],
                     info["mean_readme_embedding"]
+                ], axis=0).reshape(1, -1)
                 info["code_embeddings_shape"] = info["code_embeddings"].shape
+                info["mean_code_embedding_shape"] = info["mean_code_embedding"].shape
                 info["doc_embeddings_shape"] = info["doc_embeddings"].shape
+                info["mean_doc_embedding_shape"] = info["mean_doc_embedding"].shape
                 info["requirement_embeddings_shape"] = info["requirement_embeddings"].shape
+                info["mean_requirement_embedding_shape"] = info["mean_requirement_embedding"].shape
                 info["readme_embeddings_shape"] = info["readme_embeddings"].shape
+                info["mean_readme_embedding_shape"] = info["mean_readme_embedding"].shape
                 info["mean_repo_embedding_shape"] = info["mean_repo_embedding"].shape
                 progress_bar.update(1)