#!/usr/bin/env python3
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "en-core-web-sm",
# "spacy",
# ]
#
# [tool.uv.sources]
# en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }
# ///
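# Example invocation (a sketch; "tokenize_chunks.py", the input file names, and the
# "fiction" label below are placeholders, not files shipped with this script):
#
#   uv run tokenize_chunks.py novel1.txt novel2.txt -n 100 -l fiction -o chunks.csv --lemma
#
# This would write chunks.csv with a "text" column holding space-joined 100-token
# chunks and a "label" column holding "fiction" for every row.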
import argparse
import csv
import re
from pathlib import Path

import spacy

def main():
    parser = argparse.ArgumentParser(
        description="Tokenize text files and output chunked CSV"
    )
    parser.add_argument("files", nargs="+", help="Input text file(s) to process")
    parser.add_argument(
        "-n",
        "--tokens",
        type=int,
        default=100,
        help="Number of tokens per chunk (default: 100)",
    )
    parser.add_argument(
        "-l",
        "--label",
        type=str,
        help="Custom label for all chunks (defaults to filename)",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="output.csv",
        help="Output CSV filename (default: output.csv)",
    )
    parser.add_argument(
        "-c",
        "--max-chunks",
        type=int,
        help="Maximum number of chunks to output (default: unlimited)",
    )
    parser.add_argument(
        "--lemma",
        action="store_true",
        help="Use lemmatized forms of tokens instead of original text",
    )
    args = parser.parse_args()

    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Process files and collect chunks
    all_chunks = []
    chunks_created = 0

    for filename in args.files:
        if args.max_chunks and chunks_created >= args.max_chunks:
            break

        filepath = Path(filename)
        if not filepath.exists():
            print(f"Warning: File '{filename}' not found, skipping...")
            continue

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
        except Exception as e:
            print(f"Error reading '{filename}': {e}")
            continue

        # Split on one or more newlines
        segments = re.split(r"\n+", text)
        # Remove empty segments
        segments = [seg.strip() for seg in segments if seg.strip()]

        # Process segments through spaCy pipe
        all_tokens = []
        for doc in nlp.pipe(segments):
            # Extract tokens from each processed segment
            if args.lemma:
                tokens = [token.lemma_ for token in doc]
            else:
                tokens = [token.text for token in doc]
            all_tokens.extend(tokens)

        # Determine label
        label = args.label if args.label else filepath.name

        # Create chunks of n tokens
        for i in range(0, len(all_tokens), args.tokens):
            if args.max_chunks and chunks_created >= args.max_chunks:
                break
            chunk = all_tokens[i : i + args.tokens]
            # Only include chunks with exactly n tokens
            if len(chunk) == args.tokens:
                chunk_text = " ".join(chunk)
                all_chunks.append({"text": chunk_text, "label": label})
                chunks_created += 1

    # Write to CSV
    if all_chunks:
        with open(args.output, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["text", "label"])
            writer.writeheader()
            writer.writerows(all_chunks)
        print(f"Successfully wrote {len(all_chunks)} chunks to '{args.output}'")
        if args.lemma:
            print("Note: Tokens were lemmatized")
    else:
        print("No valid chunks to write.")


if __name__ == "__main__":
    main()