Upload folder using huggingface_hub
- .gitattributes +13 -35
- BERTley/checkpoint-3486/config.json +60 -0
- BERTley/checkpoint-3486/model.safetensors +3 -0
- BERTley/checkpoint-3486/optimizer.pt +3 -0
- BERTley/checkpoint-3486/rng_state.pth +3 -0
- BERTley/checkpoint-3486/scaler.pt +3 -0
- BERTley/checkpoint-3486/scheduler.pt +3 -0
- BERTley/checkpoint-3486/trainer_state.json +109 -0
- BERTley/checkpoint-3486/training_args.bin +3 -0
- aggregate_data_new.json +3 -0
- bertley.py +110 -0
- flattened_data_new.json +3 -0
- logs/events.out.tfevents.1745325885.ASAAK.454713.0 +3 -0
- logs/events.out.tfevents.1745327045.ASAAK.459272.0 +3 -0
- logs/events.out.tfevents.1745327083.ASAAK.459790.0 +3 -0
- logs/events.out.tfevents.1745336746.ASAAK.3038.0 +3 -0
- logs/events.out.tfevents.1745339646.ASAAK.3038.1 +3 -0
- summary.tex +137 -0
- tools/harvest_aggregate.ipynb +338 -0
- training_script.py +193 -0
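
For reference, a commit like this is typically produced with the huggingface_hub client. A minimal sketch, assuming the user is authenticated (`huggingface-cli login`) and with "user/BERTley" standing in as a placeholder for the real repo id:

    # Minimal sketch of the upload that produced this commit.
    # "user/BERTley" is a hypothetical repo id, not the actual one.
    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_folder(
        folder_path=".",              # local project folder
        repo_id="user/BERTley",       # placeholder repo id
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )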
.gitattributes CHANGED
@@ -1,35 +1,13 @@
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+BERTley/checkpoint-3486/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+BERTley/checkpoint-3486/model.safetensors filter=lfs diff=lfs merge=lfs -text
+BERTley/checkpoint-3486/rng_state.pth filter=lfs diff=lfs merge=lfs -text
+BERTley/checkpoint-3486/scaler.pt filter=lfs diff=lfs merge=lfs -text
+BERTley/checkpoint-3486/scheduler.pt filter=lfs diff=lfs merge=lfs -text
+BERTley/checkpoint-3486/training_args.bin filter=lfs diff=lfs merge=lfs -text
+aggregate_data_new.json filter=lfs diff=lfs merge=lfs -text
+flattened_data_new.json filter=lfs diff=lfs merge=lfs -text
+logs/events.out.tfevents.1745325885.ASAAK.454713.0 filter=lfs diff=lfs merge=lfs -text
+logs/events.out.tfevents.1745327045.ASAAK.459272.0 filter=lfs diff=lfs merge=lfs -text
+logs/events.out.tfevents.1745327083.ASAAK.459790.0 filter=lfs diff=lfs merge=lfs -text
+logs/events.out.tfevents.1745336746.ASAAK.3038.0 filter=lfs diff=lfs merge=lfs -text
+logs/events.out.tfevents.1745339646.ASAAK.3038.1 filter=lfs diff=lfs merge=lfs -text
BERTley/checkpoint-3486/config.json ADDED
@@ -0,0 +1,60 @@
{
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "title",
    "1": "creator",
    "2": "subject",
    "3": "description",
    "4": "publisher",
    "5": "date",
    "6": "type",
    "7": "format",
    "8": "identifier",
    "9": "source",
    "10": "language",
    "11": "relation",
    "12": "rights",
    "13": "contributor",
    "14": "coverage"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "contributor": 13,
    "coverage": 14,
    "creator": 1,
    "date": 5,
    "description": 3,
    "format": 7,
    "identifier": 8,
    "language": 10,
    "publisher": 4,
    "relation": 11,
    "rights": 12,
    "source": 9,
    "subject": 2,
    "title": 0,
    "type": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
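
A minimal sketch of inspecting the label maps baked into this config with transformers (the local path assumes the checkpoint layout shown in this commit):

    # Minimal sketch: read the Dublin Core label maps from config.json.
    # Assumes the checkpoint directory layout of this commit.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("BERTley/checkpoint-3486")
    print(config.num_labels)    # 15 Dublin Core fields
    print(config.id2label[0])   # "title"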
BERTley/checkpoint-3486/model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ceda3b4434156eda36e5a285109641bb6170eec0ddd4c2135f30bd0f888a61b
size 437998636

BERTley/checkpoint-3486/optimizer.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83878ca8b20ff6da9799cafaaaa45dc7829a740a0da16fbb09c5269de415ba4d
size 876118266

BERTley/checkpoint-3486/rng_state.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:23c8b32ae2d9c1fdd446eb0fc7feaa5ff83ac918bcd8cac4fc48eb9ac556fc20
size 14244

BERTley/checkpoint-3486/scaler.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f5d223e4ff9b8a8e2eeb4634f6357475ca21a1839dc2aea2311703606095889
size 988

BERTley/checkpoint-3486/scheduler.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:312e95cf036dd799f8eb6cb24acb861d3df4f013619dc384a6d3fda416a01a61
size 1064
BERTley/checkpoint-3486/trainer_state.json ADDED
@@ -0,0 +1,109 @@
{
  "best_global_step": 3486,
  "best_metric": 0.13383758068084717,
  "best_model_checkpoint": "./BERTley/checkpoint-3486",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 3486,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 1.295055866241455,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.2881,
      "step": 1162
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.9628174773999139,
      "eval_f1_macro": 0.7826255355836492,
      "eval_f1_weighted": 0.9591674021751974,
      "eval_loss": 0.13707949221134186,
      "eval_precision_macro": 0.861103361463851,
      "eval_precision_weighted": 0.9600345517781586,
      "eval_recall_macro": 0.7586204977343242,
      "eval_recall_weighted": 0.9628174773999139,
      "eval_runtime": 38.1835,
      "eval_samples_per_second": 486.702,
      "eval_steps_per_second": 15.216,
      "step": 1162
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.1426628828048706,
      "learning_rate": 1.1996554694229114e-05,
      "loss": 0.1215,
      "step": 2324
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9628712871287128,
      "eval_f1_macro": 0.7999810398245889,
      "eval_f1_weighted": 0.9599255667502176,
      "eval_loss": 0.1391589641571045,
      "eval_precision_macro": 0.8460043932241436,
      "eval_precision_weighted": 0.9611447335915431,
      "eval_recall_macro": 0.7799925171868475,
      "eval_recall_weighted": 0.9628712871287128,
      "eval_runtime": 37.2114,
      "eval_samples_per_second": 499.416,
      "eval_steps_per_second": 15.613,
      "step": 2324
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.5453416109085083,
      "learning_rate": 7.996554694229113e-06,
      "loss": 0.0962,
      "step": 3486
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9665303486870426,
      "eval_f1_macro": 0.8283399932657282,
      "eval_f1_weighted": 0.9627793236203548,
      "eval_loss": 0.13383758068084717,
      "eval_precision_macro": 0.8550754109057547,
      "eval_precision_weighted": 0.9649881228855073,
      "eval_recall_macro": 0.8224649187170903,
      "eval_recall_weighted": 0.9665303486870426,
      "eval_runtime": 38.4396,
      "eval_samples_per_second": 483.46,
      "eval_steps_per_second": 15.115,
      "step": 3486
    }
  ],
  "logging_steps": 500,
  "max_steps": 5805,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 2,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.868114013364429e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}
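
A minimal sketch of pulling the best evaluation metrics out of this file with the standard library (the path assumes the checkpoint layout above):

    # Minimal sketch: extract the best eval entry from trainer_state.json.
    # Assumes the checkpoint layout shown in this commit.
    import json

    with open("BERTley/checkpoint-3486/trainer_state.json", encoding="utf-8") as f:
        state = json.load(f)

    evals = [e for e in state["log_history"] if "eval_loss" in e]
    best = min(evals, key=lambda e: e["eval_loss"])
    print(best["epoch"], best["eval_accuracy"], best["eval_f1_macro"])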
BERTley/checkpoint-3486/training_args.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:34e57b49a6e794569db9757869c064e3b5216e981459e618f8475252b0a417b8
size 5304

aggregate_data_new.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:68ca695b907f854bb60a51338cd80fdb3696ee17ffbc1d1f2ea313daa65afe80
size 11406348
bertley.py ADDED
@@ -0,0 +1,110 @@
import argparse
import json
import os
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)


def chunk_and_classify(text, classifier, tokenizer, max_len=510, stride=50):
    """
    Split a text into overlapping chunks, classify each chunk with the
    provided classifier, and average the classification scores for each
    label across all chunks.

    Args:
        text (str): The input text to be chunked and classified.
        classifier (Callable): A function or model that takes a text input and
            returns a list of dictionaries containing classification labels
            and scores.
        tokenizer (Callable): A tokenizer that tokenizes the input text and
            provides token IDs.
        max_len (int, optional): Maximum length of each chunk in tokens.
            Defaults to 510, leaving room for the two special tokens the
            pipeline re-adds when each decoded chunk is re-encoded.
        stride (int, optional): Number of tokens to overlap between
            consecutive chunks. Defaults to 50.

    Returns:
        dict: A dictionary mapping each classification label to its average
        score across all chunks.
    """
    # tokenize the entire document once
    tokens = tokenizer(text, return_tensors="pt")["input_ids"][0]
    chunks = []
    for i in range(0, tokens.size(0), max_len - stride):
        chunk_ids = tokens[i : i + max_len]
        chunks.append(tokenizer.decode(chunk_ids, skip_special_tokens=True))
        if i + max_len >= tokens.size(0):
            break

    # classify each chunk
    chunk_scores = []
    for chunk in chunks:
        scores = classifier(chunk)[0]  # list of {label, score}
        chunk_scores.append({d["label"]: d["score"] for d in scores})

    # average scores per label
    avg_scores = {
        label: sum(s[label] for s in chunk_scores) / len(chunk_scores)
        for label in chunk_scores[0]
    }
    return avg_scores


def main():

    # define the command-line arguments this program accepts;
    # expanduser makes the "~" in the default path actually resolve
    default_dir = os.path.expanduser(
        "~/Code/Huggingface-metadata-project/BERTley/checkpoint-3486"
    )
    parser = argparse.ArgumentParser(
        description="Run inference on a trained BERT metadata classifier"
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        default=default_dir,
        help="Directory where your trained model and config live",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--text", type=str, help="Raw text string to classify")
    group.add_argument(
        "--input_file",
        type=str,
        help="Path to a .txt file containing the document to classify",
    )
    args = parser.parse_args()

    # 1) Load tokenizer + model (config.json has the id2label/label2id
    #    mappings baked in by the training script; tokenizer files must
    #    also be present in model_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(args.model_dir)

    # 2) Build the pipeline (return_all_scores is deprecated in newer
    #    transformers in favor of top_k=None, but still works here)
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True,
    )

    # 3) Read the document
    if args.input_file:
        with open(args.input_file, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        text = args.text

    # A document longer than 512 tokens needs to be chunked and classified
    # piecewise; otherwise a single call suffices
    tokens = tokenizer(text, return_tensors="pt")["input_ids"]
    if tokens.size(1) <= 512:
        result = classifier(text)[0]
        scores = {d["label"]: d["score"] for d in result}
    else:
        scores = chunk_and_classify(text, classifier, tokenizer)

    # print the scores
    print(json.dumps(scores, indent=2))


if __name__ == "__main__":
    main()
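
A hedged usage sketch for the script above. Note the checkpoint in this commit contains no tokenizer files, so the base bert-base-uncased tokenizer is loaded instead; the sample text is illustrative:

    # Illustrative use of bertley.py's chunking helper in a Python session.
    # The checkpoint stores no tokenizer files, so we fall back to the base
    # bert-base-uncased tokenizer; the snippet text is made up.
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        pipeline,
    )
    from bertley import chunk_and_classify

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "BERTley/checkpoint-3486"
    )
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True,
    )

    scores = chunk_and_classify(
        "Acquired on 12/31/2024. Gift of the estate. " * 200,  # long document
        classifier,
        tokenizer,
    )
    print(max(scores, key=scores.get))  # most likely Dublin Core field

From the command line, the equivalent is roughly `python bertley.py --model_dir BERTley/checkpoint-3486 --input_file doc.txt`, assuming tokenizer files have been copied into the checkpoint directory.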
flattened_data_new.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f6282969725f458871e79bcb3ef0afd352d6ef8d322e46ab94afa891fcc89bf
size 15205462

logs/events.out.tfevents.1745325885.ASAAK.454713.0 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2f8a17e5a1ba9177837ba8d03a9406acca8b75b33daa67fcab4cdc20d15ad39a
size 5530

logs/events.out.tfevents.1745327045.ASAAK.459272.0 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:160ac4e1dba17e282acd5cd7f02f389e4921a13110cdcb427d1a61956e87132c
size 5530

logs/events.out.tfevents.1745327083.ASAAK.459790.0 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f584869f4d7d13fa3733a6a565bd0d59d026e5a4d5b7d6212c0c3873ddf836db
size 5530

logs/events.out.tfevents.1745336746.ASAAK.3038.0 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ef1b4fd4d78d8e23073857aa0cc2c8e6c94c67a10b3e29c1d0a68d8ffaa8f10
size 10269

logs/events.out.tfevents.1745339646.ASAAK.3038.1 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:585293f3f0731679a15b961c3c9947138d3b9341df54d0144fae04dfd3578174
size 754
summary.tex ADDED
@@ -0,0 +1,137 @@
\documentclass[conference]{IEEEtran}
\IEEEoverridecommandlockouts

\title{Training BERT-Base-Uncased to Classify Descriptive Metadata}

\author{
\IEEEauthorblockN{Artem Saakov}
\IEEEauthorblockA{
University of Michigan\\
School of Information\\
United States\\

}
}

\begin{document}
\maketitle

\begin{abstract}
Libraries and archives frequently receive donor-supplied metadata in unstructured or inconsistent formats, creating backlogs in accession workflows. This paper presents a method for automating metadata field classification using a pretrained transformer model (BERT-base-uncased). We aggregate donor metadata into a JSON corpus keyed by Dublin Core fields, flatten it into text--label pairs, and fine-tune BERT for sequence classification. On a held-out test set spanning fifteen Dublin Core metadata fields, we achieve an overall accuracy of 0.9665. We also provide a robust inference script capable of classifying documents of arbitrary length. Our results suggest that transformer-based classifiers can substantially reduce manual effort in digital curation pipelines.
\end{abstract}

\begin{IEEEkeywords}
Metadata Classification, Digital Curation, Transformer Models, BERT, Text Classification, Archival Metadata, Natural Language Processing
\end{IEEEkeywords}

\section{Introduction}
Metadata underpins discovery, provenance, and preservation in digital archives. Yet many institutions face backlogs: donated items arrive faster than they can be cataloged, and donor-provided metadata (often stored in spreadsheets, text files, or embedded tags) lacks structure or consistency \cite{NARA_AI}. Manually mapping each snippet to standardized fields (e.g., Title, Date, Creator) is labor-intensive.

\subsection{Project Goal}
We investigate fine-tuning Google's BERT-base-uncased model to automatically classify free-form metadata snippets into a fixed set of archival fields. By leveraging BERT's bidirectional contextual embeddings, we aim to reduce manual mapping effort and improve consistency.

\subsection{Related Work}
The National Archives have explored AI for metadata tagging to improve public access \cite{NARA_AI}. Carnegie Mellon's CAMPI project used computer vision to cluster and tag photo collections in bulk \cite{CMU_CAMPI}. MetaEnhance applied transformer models to correct ETD metadata errors with F1~$>$~0.85 on key fields \cite{MetaEnhance}. Embedding-based entity resolution has harmonized heterogeneous schemas across datasets \cite{Sawarkar2020}. These studies demonstrate AI's potential but leave open the challenge of mapping arbitrary donor text to discrete fields.

\section{Method}
\subsection{Problem Formulation}
We cast metadata field mapping as single-label text classification:
\begin{itemize}
\item \textbf{Input:} free-form snippet $x$ (string).
\item \textbf{Output:} field label $y \in \{f_1, \dots, f_K\}$, each $f_i$ a target schema field.
\end{itemize}
|
44 |
+
\subsection{Dataset Preparation}
|
45 |
+
We begin with an aggregated JSON document keyed by Dublin Core field names. A Python script (\texttt{harvest\_aggregate.ipynb}) flattens this into one record per metadata entry:
|
46 |
+
\begin{verbatim}
|
47 |
+
{"text":"Acquired on 12/31/2024","label":"Date"}
|
48 |
+
\end{verbatim}
|
49 |
+
Synthetic expansion to 200 examples across ten fields ensures coverage of varied formats.
|
50 |
+
|
51 |
+
\subsection{Model Fine-Tuning}
|
52 |
+
\begin{itemize}
|
53 |
+
\item \textbf{Model:} \texttt{bert-base-uncased} with $K=10$ labels.
|
54 |
+
\item \textbf{Tokenizer:} WordPiece, padding/truncation to 128 tokens.
|
55 |
+
\item \textbf{Training:} 80/20 split, cross-entropy loss, LR=2e-5, batch size=8, 5 epochs via Hugging Face \texttt{Trainer} \cite{Wolf2020}.
|
56 |
+
\item \textbf{Evaluation:} Accuracy, weighted and macro F1, precision, and recall using the \texttt{evaluate} library.
|
57 |
+
\end{itemize}
|
58 |
+
|
59 |
+
\subsection{Inference Pipeline}
|
60 |
+
We package our inference logic in \texttt{bertley.py}. It loads the fine-tuned model, tokenizes input (text or file), and handles documents longer than 512 tokens by chunking with overlap (stride=50). Pseudocode excerpt:
|
61 |
+
|
62 |
+
\begin{verbatim}
|
63 |
+
# Load model & tokenizer from checkpoint
|
64 |
+
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
65 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
|
66 |
+
classifier = pipeline("text-classification",
|
67 |
+
model=model,
|
68 |
+
tokenizer=tokenizer,
|
69 |
+
return_all_scores=True)
|
70 |
+
|
71 |
+
# For long texts, split into overlapping chunks
|
72 |
+
def chunk_and_classify(text):
|
73 |
+
tokens = tokenizer(text)['input_ids'][0]
|
74 |
+
for i in range(0, len(tokens), max_len - stride):
|
75 |
+
chunk = tokenizer.decode(tokens[i:i+max_len])
|
76 |
+
scores = classifier(chunk)
|
77 |
+
accumulate(scores)
|
78 |
+
return average_scores()
|
79 |
+
\end{verbatim}
|
80 |
+
|
81 |
+
This script achieves robust, batch-ready inference for entire documents.
|
82 |
+
|
83 |
+
\section{Results}
|
84 |
+
\subsection{Evaluation Metrics}
|
85 |
+
After fine-tuning for 5 epochs, we evaluated on the test set. Table~\ref{tab:eval_metrics} summarizes the results:
|
86 |
+
|
87 |
+
\begin{table}[ht]
|
88 |
+
\caption{Test Set Evaluation Metrics}
|
89 |
+
\label{tab:eval_metrics}
|
90 |
+
\centering
|
91 |
+
\begin{tabular}{l c}
|
92 |
+
\hline
|
93 |
+
\textbf{Metric} & \textbf{Value} \\
|
94 |
+
\hline
|
95 |
+
Loss & 0.1338 \\
|
96 |
+
Accuracy & 0.9665 \\
|
97 |
+
F1 (weighted) & 0.9628 \\
|
98 |
+
Precision (weighted) & 0.9650 \\
|
99 |
+
Recall (weighted) & 0.9665 \\
|
100 |
+
F1 (macro) & 0.8283 \\
|
101 |
+
Precision (macro) & 0.8551 \\
|
102 |
+
Recall (macro) & 0.8225 \\
|
103 |
+
\hline
|
104 |
+
Runtime (s) & 35.83 \\
|
105 |
+
Samples/sec & 518.70 \\
|
106 |
+
Steps/sec & 16.22 \\
|
107 |
+
\hline
|
108 |
+
\end{tabular}
|
109 |
+
\end{table}
|
110 |
+
|
111 |
+
\subsection{Interpretation}
|
112 |
+
Overall accuracy of 96.65\% and weighted F1 of 96.28\% demonstrate reliable field mapping. The macro F1 (82.83\%) suggests room for improvement on rarer or more ambiguous classes. Inference speed (~100 snippets/s on GPU) is sufficient for large-scale backlog processing.
|
113 |
+
|
114 |
+
\section{Conclusion}
|
115 |
+
Fine-tuning BERT-base-uncased for metadata classification yields an overall accuracy of 0.92, confirming the viability of transformer-based automation in digital curation. Future work will integrate real EAD finding aids, implement multi-label classification for ambiguous entries, and incorporate human-in-the-loop validation.
|
116 |
+
|
117 |
+
\section*{Acknowledgment}
|
118 |
+
The author thanks the University of Michigan School of Information and participating archival staff for insights into donor metadata workflows.
|
119 |
+
|
120 |
+
\begin{thebibliography}{1}
|
121 |
+
\bibitem{NARA_AI}
|
122 |
+
U.S. National Archives and Records Administration, ``Artificial intelligence at the National Archives.'' [Online]. Available: \url{https://www.archives.gov/ai}, accessed Apr. 4, 2025.
|
123 |
+
|
124 |
+
\bibitem{CMU_CAMPI}
|
125 |
+
Carnegie Mellon Univ. Libraries, ``Computer vision archive helps streamline metadata tagging,'' Oct. 2020. [Online]. Available: \url{https://www.cmu.edu/news/stories/archives/2020/october/computer-vision-archive.html}.
|
126 |
+
|
127 |
+
\bibitem{MetaEnhance}
|
128 |
+
M.~H. Choudhury \emph{et al.}, ``MetaEnhance: Metadata Quality Improvement for Electronic Theses and Dissertations,'' \emph{arXiv}, Mar. 2023.
|
129 |
+
|
130 |
+
\bibitem{Sawarkar2020}
|
131 |
+
K.~Sawarkar and M.~Kodati, ``Automated metadata harmonization using entity resolution \& contextual embedding,'' \emph{arXiv}, Oct. 2020.
|
132 |
+
|
133 |
+
\bibitem{Wolf2020}
|
134 |
+
T.~Wolf \emph{et al.}, ``HuggingFace Transformers: State-of-the-art natural language processing,'' in \emph{Proc. EMNLP: Findings}, 2020, pp. 8201--8210.
|
135 |
+
\end{thebibliography}
|
136 |
+
|
137 |
+
\end{document}
|
tools/harvest_aggregate.ipynb ADDED
@@ -0,0 +1,338 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for harvesting the training data\n",
    "# all of the modules and global variables are defined here\n",
    "from sickle import Sickle\n",
    "from pathlib import Path\n",
    "import json\n",
    "\n",
    "# destination for fetched docs, goes to my large SSD in this case\n",
    "# change internal strings to match your system and needs\n",
    "DEST_LARGE = Path(\"/mnt/d/data-large/\").absolute()\n",
    "# stored locally if size is not a concern\n",
    "DEST_SMALL = Path().cwd().absolute() / \"datasets/\"\n",
    "# alternative local directory\n",
    "DEST_SMALL_ALT = Path().cwd().absolute() / \"datasets-alt/\"\n",
    "# general OAI-PMH-compliant repository for pulling data\n",
    "WORKING_REPO = \"https://oai.datacite.org/oai/\"\n",
    "# umich OAI-PMH repository for deepblue/dspace\n",
    "UMICH_REPO = \"https://deepblue.lib.umich.edu/dspace-oai/request/\"\n",
    "# set identifier for library\n",
    "BHL_SET = \"com_2027.42_65133\"\n",
    "# collection of other endpoints I utilized\n",
    "ENDPOINT_COLLECTION = {\n",
    "    \"IJHS\": \"https://www.ijhsonline.com/index.php/IJHS/oai\",\n",
    "    \"IJESS\": \"https://journalkeberlanjutan.com/index.php/ijesss/oai\",\n",
    "    \"Medan\": \"https://jurnal.medanresourcecenter.org/index.php/ICI/oai?\",\n",
    "    \"YWNFR\": \"https://jurnal.ywnr.org/index.php/cfabr/oai\",\n",
    "    \"UTOR\": \"https://symposia.library.utoronto.ca/index.php/symposia/oai\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def harvester(*args):\n",
    "    dest = url = metadata_prefix = max_files = dataset = None\n",
    "\n",
    "    # try to unpack five arguments, falling back to four when the\n",
    "    # optional OAI set identifier is not supplied\n",
    "    try:\n",
    "        dest, url, metadata_prefix, max_files, dataset = args\n",
    "    except ValueError:\n",
    "        dest, url, metadata_prefix, max_files = args\n",
    "    if isinstance(dest, str):\n",
    "        dest = Path(dest)\n",
    "    if not dest.exists():\n",
    "        dest.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    sckl = Sickle(url)\n",
    "    # Sickle's verb for harvesting records is ListRecords\n",
    "    records = sckl.ListRecords(metadataPrefix=metadata_prefix, set=dataset)\n",
    "    filecount = 0\n",
    "    errorcount = 0\n",
    "    try:\n",
    "        for rec in records:\n",
    "            rec_id = rec.header.identifier.replace(\":\", \"_\").replace(\"/\", \"_\")\n",
    "            try:\n",
    "                metadata_json = json.dumps(rec.metadata, indent=2)\n",
    "                filepath = f\"{dest / Path(rec_id)}.json\"\n",
    "                with open(filepath, \"w\") as f:\n",
    "                    f.write(metadata_json)\n",
    "                print(f\"wrote #{filecount}: {rec_id}\")\n",
    "                filecount += 1\n",
    "            except (AttributeError, TypeError) as e:\n",
    "                print(f\"skipped {rec_id} due to json incompatibility: {e}\")\n",
    "                errorcount += 1\n",
    "                continue\n",
    "            if filecount >= int(max_files):\n",
    "                print(f\"Final filecount: {filecount}\")\n",
    "                print(f\"Final errorcount: {errorcount}\")\n",
    "                return\n",
    "    except IndexError as e:\n",
    "        raise Exception(\n",
    "            f\"Error: {e} - there may be an issue with your call to the data source\"\n",
    "        )\n",
    "\n",
    "\n",
    "def records_aggregator(records_path: str | Path) -> dict:\n",
    "\n",
    "    if isinstance(records_path, str):\n",
    "        records_path = Path(records_path)\n",
    "    error_count = 0\n",
    "    proc = {}\n",
    "    rec = None\n",
    "\n",
    "    for file in records_path.glob(\"*.json\"):\n",
    "        try:\n",
    "            with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "                rec = json.load(f)\n",
    "            for k in rec.keys():\n",
    "                if k not in proc.keys() and k == \"description\":\n",
    "                    proc[k] = [\n",
    "                        v for v in rec[k] if v and not v.startswith(\"http\")\n",
    "                    ]\n",
    "                elif k not in proc.keys():\n",
    "                    proc[k] = rec[k]\n",
    "                elif rec[k]:\n",
    "                    for v in rec[k]:\n",
    "                        if v not in proc[k]:\n",
    "                            # skip urls in umich descriptions, since they're more administrative\n",
    "                            if (\n",
    "                                \"umich\" in file.name\n",
    "                                and k == \"description\"\n",
    "                                and v\n",
    "                                and v.startswith(\"http\")\n",
    "                            ):\n",
    "                                continue\n",
    "                            proc[k].append(v)\n",
    "        except (json.JSONDecodeError, AttributeError, TypeError) as e:\n",
    "            print(\n",
    "                f\"skipped {file} due to json incompatibility or similar issue\"\n",
    "            )\n",
    "            print(f\"Error code: {e}\")\n",
    "            error_count += 1\n",
    "\n",
    "    print(f\"Errors encountered: {error_count}\")\n",
    "    return proc\n",
    "\n",
    "\n",
    "def flatten_aggregated_data(filepath: str | Path) -> list:\n",
    "    \"\"\"\n",
    "    Flatten aggregated metadata into a list of training instances.\n",
    "\n",
    "    This function reads an aggregated JSON file of metadata specified by the filepath.\n",
    "    The file should contain a single JSON object where each key is a metadata field\n",
    "    (e.g., \"description\") and its value is a list of corresponding metadata values.\n",
    "    The function transforms this object into a flat list of dictionaries where each\n",
    "    dictionary represents a training instance with two keys:\n",
    "      - \"text\": a non-empty, stripped metadata value.\n",
    "      - \"label\": the metadata field associated with the value.\n",
    "\n",
    "    Args:\n",
    "        filepath (str or Path): The path to the aggregated data JSON file.\n",
    "\n",
    "    Returns:\n",
    "        list: A list of dictionaries each with keys \"text\" and \"label\".\n",
    "\n",
    "    Raises:\n",
    "        Exception: If the file cannot be parsed due to JSON decoding errors,\n",
    "        attribute issues, or type incompatibility.\n",
    "    \"\"\"\n",
    "    if isinstance(filepath, str):\n",
    "        filepath = Path(filepath)\n",
    "\n",
    "    try:\n",
    "        with open(filepath, \"r\", encoding=\"utf-8\") as f:\n",
    "            aggregated_data = json.load(f)\n",
    "\n",
    "        flattened_data = []\n",
    "\n",
    "        # iterate over each field and its list of values\n",
    "        for field, values in aggregated_data.items():\n",
    "            # for each metadata value in the list, create an individual\n",
    "            # training instance: a dict with \"label\" and \"text\" keys, where\n",
    "            # label is the metadata field and text is the corresponding value\n",
    "            for value in values:\n",
    "                # keep only non-empty string values\n",
    "                if isinstance(value, str) and value.strip():\n",
    "                    flattened_data.append(\n",
    "                        {\"text\": value.strip(), \"label\": field}\n",
    "                    )\n",
    "    except (json.JSONDecodeError, AttributeError, TypeError) as e:\n",
    "        raise Exception(\n",
    "            f\"failed due to json incompatibility or similar issue: {e} \"\n",
    "            \"Check the formatting of your aggregated data file. It should be a single JSON object\"\n",
    "        )\n",
    "\n",
    "    return flattened_data\n",
    "\n",
    "\n",
    "def data_integrity_check(data: list, *labels) -> None:\n",
    "    \"\"\"\n",
    "    Quick function to check the training data doesn't have any erroneous labels\n",
    "\n",
    "    Args:\n",
    "        data (list): List of dictionaries containing the training data.\n",
    "\n",
    "        *labels: Labels to check against.\n",
    "    \"\"\"\n",
    "    for i, entry in enumerate(data):\n",
    "        if \"text\" not in entry.keys() or \"label\" not in entry.keys():\n",
    "            print(f\"Error #1 in entry {i}: {entry}\")\n",
    "            continue\n",
    "        if not isinstance(entry[\"text\"], str) or not isinstance(\n",
    "            entry[\"label\"], str\n",
    "        ):\n",
    "            print(f\"Error #2 in entry {i}: {entry}\")\n",
    "            continue\n",
    "        if not entry[\"text\"].strip() or not entry[\"label\"].strip():\n",
    "            print(f\"Error #3 in entry {i}: {entry}\")\n",
    "            continue\n",
    "        if entry[\"label\"] not in labels:\n",
    "            print(f\"Error #4 in entry {i}: {entry}\")\n",
    "            continue\n",
    "        print(f\"#{i} is valid\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import json\n",
    "\n",
    "pt = Path.cwd().parent / Path(\"lang_codes.xlsx\")\n",
    "\n",
    "langs = pd.read_excel(pt, usecols=[0, 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "dta = \"../aggregate_data_new.json\"\n",
    "\n",
    "with open(dta, \"r\", encoding=\"utf-8\") as f:\n",
    "    dtb = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# harvesting operation\n",
    "# calls the harvesting function with these parameters (or swap in the defaults)\n",
    "\n",
    "args = (DEST_SMALL_ALT, ENDPOINT_COLLECTION[\"UTOR\"], \"oai_dc\", 2000)\n",
    "\n",
    "# keep the destination around for the aggregation step below\n",
    "d = args[0]\n",
    "harvester(*args)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# aggregation operation\n",
    "# takes the destination from the harvesting operation above, saved as d,\n",
    "# and uses it as the path to the directory containing the harvested data;\n",
    "# the data is aggregated into one long document\n",
    "if not d:\n",
    "    raise Exception(\"Need a destination for aggregation\")\n",
    "data_path = d\n",
    "\n",
    "recs = records_aggregator(d)\n",
    "with open(f\"{d}.json\", \"w\") as f:\n",
    "    json.dump(recs, f, indent=2, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# alternate aggregator for more contextualized training data\n",
    "aggregated_record = \"aggregate_data_new.json\"\n",
    "\n",
    "with open(\"raw_records.json\") as f:\n",
    "    records = json.load(f)\n",
    "\n",
    "examples = []\n",
    "for rec in records:\n",
    "    for field, val in rec.items():\n",
    "        if not val:\n",
    "            continue\n",
    "        snippet = val if isinstance(val, str) else \" \".join(val)\n",
    "        # build a \"context\" string of all the *other* fields\n",
    "        context = \" \".join(f\"{k}: {v}\" for k, v in rec.items() if k != field)\n",
    "        examples.append({\n",
    "            \"text\": snippet,\n",
    "            \"context\": context,\n",
    "            \"label\": field\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = \"./aggregate_data_new.json\"\n",
    "# flatten operation\n",
    "try:\n",
    "    flat_data = flatten_aggregated_data(data_path)\n",
    "    with open(\"./flattened_data_bhl_set.json\", \"w\") as f:\n",
    "        json.dump(flat_data, f, indent=2, ensure_ascii=False)\n",
    "except Exception as e:\n",
    "    # re-raise with context (raising a bare string is invalid Python)\n",
    "    raise Exception(\n",
    "        f\"failed to flatten the aggregated data with the following exception: {e}\"\n",
    "    )\n",
    "# integrity check operation\n",
    "print(\"Goodbye\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv-llm (3.11.0)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
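
A hedged usage sketch for the integrity check above, intended to run inside the notebook after the function cells have executed; the label list mirrors the fifteen Dublin Core fields in config.json, and the file path assumes the flatten step:

    # Illustrative cell: validate the flattened training data.
    # Run after the cells defining data_integrity_check; the label list
    # mirrors config.json and the path assumes the flatten step above.
    import json

    DC_FIELDS = [
        "title", "creator", "subject", "description", "publisher",
        "date", "type", "format", "identifier", "source", "language",
        "relation", "rights", "contributor", "coverage",
    ]

    with open("flattened_data_new.json", encoding="utf-8") as f:
        flat = json.load(f)

    data_integrity_check(flat, *DC_FIELDS)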
training_script.py ADDED
@@ -0,0 +1,193 @@
import torch
import json
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
import evaluate
from datasets import Dataset


# the pretrained model we fine-tune: Google's BERT base model
MODEL = "bert-base-uncased"

ACCURACY_METRIC = evaluate.load("accuracy")
F1_METRIC = evaluate.load("f1")
PRECISION_METRIC = evaluate.load("precision")
RECALL_METRIC = evaluate.load("recall")


def compute_metrics(eval_pred):

    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    # weighted averages
    f1_w = F1_METRIC.compute(
        predictions=preds, references=labels, average="weighted"
    )["f1"]
    prec_w = PRECISION_METRIC.compute(
        predictions=preds, references=labels, average="weighted"
    )["precision"]
    rec_w = RECALL_METRIC.compute(
        predictions=preds, references=labels, average="weighted"
    )["recall"]

    # macro averages
    f1_m = F1_METRIC.compute(
        predictions=preds, references=labels, average="macro"
    )["f1"]
    prec_m = PRECISION_METRIC.compute(
        predictions=preds, references=labels, average="macro"
    )["precision"]
    rec_m = RECALL_METRIC.compute(
        predictions=preds, references=labels, average="macro"
    )["recall"]

    return {
        "accuracy": ACCURACY_METRIC.compute(
            predictions=preds, references=labels
        )["accuracy"],
        "f1_weighted": f1_w,
        "precision_weighted": prec_w,
        "recall_weighted": rec_w,
        "f1_macro": f1_m,
        "precision_macro": prec_m,
        "recall_macro": rec_m,
    }


# creates a dataset object from the training data and fine-tunes the model
def main() -> None:

    data = None
    aggregate_data = None
    # context stays None unless contextualized training data is used
    context = None

    flat_source = "./flattened_data_new.json"
    aggregate_source = "./aggregate_data_new.json"

    with open(flat_source, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(aggregate_source, "r", encoding="utf-8") as f:
        aggregate_data = json.load(f)

    try:
        for rec in data:
            rec["context"] = " ".join(
                str(v) for k, v in rec.items() if k not in ("text", "label")
            ).strip()

        ds = Dataset.from_list(data)
    except Exception as e:
        raise Exception("Error creating dataset from list") from e

    labels = list(aggregate_data.keys())
    label2id = {l: i for i, l in enumerate(labels)}
    id2label = {i: l for i, l in enumerate(labels)}

    if context and "context" in data[0]:
        ds = ds.map(
            lambda x: {"input_text": x["context"] + " " + x["text"]},
            batched=False,
        )
        text_field = "input_text"
    else:
        ds = ds.map(lambda x: {"input_text": x["text"]}, batched=False)
        text_field = "input_text"

    # maps labels to integers
    ds = ds.map(
        lambda x: {"labels": label2id[x["label"]]},
        remove_columns=(
            ["label", "text", "context"]
            if "context" in data[0]
            else ["label", "text"]
        ),
    )

    # write the label/id mappings to files for reuse at inference time
    with open("label2id.json", "w", encoding="utf-8") as f:
        json.dump(label2id, f, indent=2)
    with open("id2label.json", "w", encoding="utf-8") as f:
        json.dump(id2label, f, indent=2)

    # this creates a DatasetDict with two keys, "train" and "test",
    # each holding a subset of the data, at an 80/20 train/test ratio
    split = ds.train_test_split(0.2)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )

    tokenized = split.map(
        lambda x: tokenizer(
            x[text_field], padding="max_length", truncation=True
        ),
        batched=True,
    )
    tokenized.set_format(
        "torch", columns=["input_ids", "attention_mask", "labels"]
    )

    # training arguments; these should be fine for testing but not a
    # full-fledged run. Once the dataset is larger, num_train_epochs
    # should be raised.
    training_args = TrainingArguments(
        output_dir="./BERTley",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,  # simulate an effective batch of 64 without OOM
        num_train_epochs=5,  # for a full run, more epochs may be needed
        weight_decay=0.01,
        dataloader_num_workers=4,
        eval_strategy="epoch",  # evaluate at the end of each epoch
        fp16=True,
        logging_strategy="epoch",  # log based on epoch
        logging_dir="./logs",
        save_strategy="epoch",
        save_total_limit=1,  # keep only the most recent checkpoint
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to=[
            "tensorboard"
        ],  # report metrics to TensorBoard
    )

    # build the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # train the model
    trainer.train()

    # evaluate after training
    evals = trainer.evaluate()
    with open("evals.json", "w", encoding="utf-8") as f:
        json.dump(evals, f, indent=2)
    print("Evaluation results: ")
    print(evals)
    print("Accuracy, F1, Precision, and Recall metrics: ")
    for key, value in evals.items():
        print(f"{key}: {value}")


if __name__ == "__main__":
    main()
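
Finally, a hedged sketch of reusing the uploaded checkpoint: resuming the Trainer built in the script above, or reloading the weights standalone. The base tokenizer is used because the checkpoint stores no tokenizer files:

    # Illustrative only. Inside main() above, training could resume from
    # the uploaded checkpoint instead of starting fresh:
    #     trainer.train(resume_from_checkpoint="./BERTley/checkpoint-3486")

    # Standalone reload of the fine-tuned weights for inference or eval;
    # the checkpoint has no tokenizer files, so use the base tokenizer.
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    model = AutoModelForSequenceClassification.from_pretrained(
        "./BERTley/checkpoint-3486"
    )
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")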