File size: 3,748 Bytes
1bc7e54
 
 
 
ecfceb8
 
 
1bc7e54
ecfceb8
 
 
1bc7e54
 
 
ecfceb8
1bc7e54
 
 
 
 
ecfceb8
7837959
 
1bc7e54
ecfceb8
 
7837959
ecfceb8
eb5363b
1bc7e54
ecfceb8
1bc7e54
 
7837959
 
ecfceb8
1bc7e54
ecfceb8
1bc7e54
7837959
1bc7e54
 
7837959
ecfceb8
7837959
 
1bc7e54
ecfceb8
 
1bc7e54
 
 
ecfceb8
 
 
 
 
 
1bc7e54
 
 
ecfceb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7837959
 
 
ecfceb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1bc7e54
 
 
 
 
 
 
 
7837959
 
ecfceb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# PEP 517 build configuration: the project is built with the Hatchling backend.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Core package metadata (PEP 621).
[project]
name = "gte-qwen2-7b-instruct-m2v"
version = "0.1.0"
description = "Model2Vec distillation pipeline for gte-Qwen2-7B-instruct"
readme = "README.md"
requires-python = ">=3.12"
# Runtime requirements as PEP 508 strings, kept in alphabetical order.
# NOTE(review): editables, hatchling and setuptools are build tooling; confirm
# they are really needed at runtime rather than only at build time.
dependencies = [
    "accelerate>=1.7.0",
    "beam-client>=0.2.155",
    "boto3>=1.38.23",
    "datasets>=3.6.0",
    "dotenv>=0.9.9",
    "editables>=0.5",
    "einops>=0.8.1",
    "flash-attn>=2.7.4.post1",
    "hatchling>=1.27.0",
    "iso639>=0.1.4",
    "jinja2>=3.0.0",
    "joblib>=1.0.0",
    # Exact pin to a release-candidate build.
    # NOTE(review): confirm this pre-release pin is still required.
    "kaleido==1.0.0rc13",
    "lightning>=2.5.1.post0",
    "matplotlib>=3.10.3",
    "more-itertools>=10.5.0",
    "mteb>=1.14.15",
    "numpy>=1.26.4",
    "plotly>=6.1.1",
    "psutil>=7.0.0",
    "pydantic>=2.11.5",
    "requests>=2.32.3",
    "rich>=10.0.0",
    "safetensors>=0.3.0",
    "scikit-learn>=1.6.1",
    "seaborn>=0.13.2",
    "sentence-transformers>=4.1.0",
    "setuptools>=80.8.0",
    "skops>=0.11.0",
    "smart-open[s3]>=7.1.0",
    "statsmodels>=0.14.4",
    "tokenizers>=0.20",
    "torch>=2.7.0",
    "tqdm>=4.65.0",
    # Upper bound only, no lower bound.
    # NOTE(review): record why releases newer than 4.52.1 are excluded.
    "transformers<=4.52.1",
    "typer>=0.16.0",
]

# Console entry point: installs a `distiller` command that calls the `app`
# object defined in the `distiller.__main__` module.
[project.scripts]
distiller = "distiller.__main__:app"

# Development-only tooling; listed as a dependency group rather than in
# [project] `dependencies`.
[dependency-groups]
dev = ["mypy>=1.15.0", "ruff>=0.11.6"]

# Wheel contents: ship the package located at src/distiller.
[tool.hatch.build.targets.wheel]
packages = ["src/distiller"]

[tool.mypy]
# mypy treats every `exclude` entry as a regular expression that is searched
# anywhere in the relative path (directories get a trailing "/"). Each pattern
# is therefore anchored to a whole directory component: an unanchored "dist"
# would also match "src/distiller/" and silently drop the package itself from
# type checking. Literal (single-quoted) strings avoid double escaping.
exclude = [
    '(^|/)\.git/',
    '(^|/)\.ruff_cache/',
    '(^|/)\.venv/',
    '(^|/)venv/',
    '(^|/)__pycache__/',
    '(^|/)build/',
    '(^|/)dist/',
    '(^|/)vendor/',
]
# Analyze installed packages even when they ship no py.typed marker or stubs.
follow_untyped_imports = true

[tool.ruff]
line-length = 120
target-version = "py312"

# Paths Ruff skips entirely (applies to both linting and formatting).
exclude = [
    ".git",
    ".ruff_cache",
    ".venv",
    "venv",
    "__pycache__",
    "build",
    "dist",
    "vendor",
    "src/distiller/model2vec",
    "src/distiller/tokenlearn",
]

[tool.ruff.lint]
# Enable all rules by default, then selectively disable
select = ["ALL"]
ignore = [
    # Rules that conflict with other tools/preferences
    "D203",  # one-blank-line-before-class
    "D212",  # multi-line-summary-first-line
    "FBT001",  # Boolean positional arg in function definition (required for typer)
    "FBT002",  # Boolean default value in function definition (required for typer)
    "C901",  # function too complex
    "PLR0911",  # too many return statements
    "PLR0912",  # too many branches
    "PLR0913",  # too many arguments in function definition
    "PLR0915",  # too many statements
    "TRY300",  # Consider moving this statement to an `else` block
    "COM812",  # Trailing comma missing (rule conflicts with the formatter)
    "TC001",  # Move application import into a type-checking block
    "ERA001", # Found commented-out code
    "G004", # Logging statement uses f-string
    "TD003", # Missing issue link for a TODO
    "TRY301", # Abstract raise to an inner function
    # Disable rules that conflict with tab indentation
    "E101",  # Indentation contains mixed spaces and tabs
    "W191",  # indentation contains tabs
    "D206",  # indent with spaces, not tabs
    # Further project-preference exemptions (unrelated to tab indentation)
    "PD901", # Avoid using the generic variable name `df` for DataFrames
    "ANN401", # Dynamically typed expressions (typing.Any) are disallowed
    "D103",   # Missing docstring in public function
    "BLE001", # Do not catch blind exception: `Exception`
    "T201", # `print` found
    "E501", # Line too long
    "PLR2004", # Magic value used in comparison
    "RUF001", # String contains ambiguous unicode character
    "D100", # Missing docstring in public module
    "D101", # Missing docstring in public class
]

[tool.ruff.lint.mccabe]
# Complexity threshold used by rule C901.
# NOTE(review): C901 is listed in [tool.ruff.lint] `ignore`, so this limit is
# not enforced at present.
max-complexity = 10

[tool.ruff.lint.pylint]
# Thresholds for PLR0913 (max-args), PLR0912 (max-branches) and PLR0915
# (max-statements).
# NOTE(review): all three rules are listed in [tool.ruff.lint] `ignore`, so
# these limits are not enforced at present.
max-args = 5
max-branches = 12
max-statements = 50

[tool.ruff.lint.pydocstyle]
# Check docstrings against the Google docstring convention.
convention = "google"

[tool.ruff.format]
quote-style = "double"
# Tabs for indentation; this is why E101, W191 and D206 are ignored in
# [tool.ruff.lint].
indent-style = "tab"
# false: an existing trailing comma keeps a collection expanded one item per line.
skip-magic-trailing-comma = false
# Line endings are detected per file rather than forced.
line-ending = "auto"