WCNegentropy committed on
Commit
b9246a0
·
verified ·
1 Parent(s): 2f39acc

🚀 OS Launch: Clean documentation and refined licensing

Browse files

This OS launch commit includes:

✅ **Cleaned Documentation**
- Removed inflated claims and marketing language
- Added honest research status and limitations
- Created professional model card and validation reports
- Streamlined licensing to AGPLv3 + commercial contact

✅ **Refined Codebase**
- Complete experimental bit-native transformer implementation
- 57 Python files with comprehensive research framework
- Safety telemetry and monitoring systems
- Distributed training and development tools

✅ **Professional Standards**
- Empirical validation of all claims
- Clear experimental vs production distinctions
- Rigorous research methodology requirements
- Community contribution framework

Ready for serious research evaluation and academic investigation.

Files changed (1) hide show
  1. markov_spline_cli.py +307 -0
markov_spline_cli.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ MarkovSpline CLI Interface for BitTransformerLM Integration
4
+
5
+ Provides command-line tools for using MarkovSpline data smoothing
6
+ with BitTransformerLM training and inference pipelines.
7
+ """
8
+
9
+ import argparse
10
+ import sys
11
+ import os
12
+ import json
13
+ import numpy as np
14
+ import torch
15
+ from pathlib import Path
16
+ from typing import List, Dict, Any, Optional
17
+
18
+ # Add MarkovSpline to path
19
+ sys.path.insert(0, '/data/MarkovSpline')
20
+ from bitpipe_integration import MarkovSplineBitPipeModule, create_markov_spline_bitpipe_module
21
+ from core import SplineType
22
+
23
+ # Simple text to bits converter for CLI
24
class TextToBitsConverter:
    """Convert text into fixed-length lists of bits (0/1 integers)."""

    def text_to_bits(self, text, max_length=128):
        """Convert text to a bit sequence of exactly ``max_length`` bits.

        The text is encoded as UTF-8 and every byte contributes eight bits,
        most significant bit first. Only the first ``max_length // 8`` bytes
        are used; the result is right-padded with zeros up to ``max_length``.

        Args:
            text: String to convert.
            max_length: Number of bits in the returned list.

        Returns:
            A list of ``max_length`` integers, each 0 or 1. A non-positive
            ``max_length`` yields an empty list.
        """
        if max_length <= 0:
            return []

        # Work on bytes rather than code points: formatting ord(char) with
        # '08b' yields more than eight bits for any code point above 255,
        # which would shift every following character out of alignment.
        # 'surrogatepass' keeps the conversion from raising on lone
        # surrogates.
        payload = text.encode('utf-8', errors='surrogatepass')[:max_length // 8]

        bit_sequence = [
            (byte >> shift) & 1
            for byte in payload
            for shift in range(7, -1, -1)
        ]

        # payload holds at most max_length // 8 bytes, so only padding (never
        # truncation) can be required here.
        bit_sequence.extend([0] * (max_length - len(bit_sequence)))
        return bit_sequence
41
+
42
+
43
class MarkovSplineBitTransformerCLI:
    """CLI interface for MarkovSpline + BitTransformerLM integration.

    Every public method reports progress and failures with ``print`` and
    signals the outcome through its return value instead of raising.
    """

    def __init__(self):
        # Stays None until initialize_markov_spline() succeeds.
        self.markov_module = None
        self.text_converter = TextToBitsConverter()

    @staticmethod
    def _json_default(obj):
        """Fallback serializer for values ``json`` cannot encode natively.

        Objects exposing a callable ``tolist()`` are written as plain Python
        lists/scalars so numeric data stays machine-readable; anything else
        is stringified, as the previous ``default=str`` did.
        """
        to_list = getattr(obj, 'tolist', None)
        if callable(to_list):
            return to_list()
        return str(obj)

    def initialize_markov_spline(self, config: Optional[Dict] = None) -> bool:
        """Initialize MarkovSpline module with configuration.

        Args:
            config: Passed unchanged to ``create_markov_spline_bitpipe_module``.

        Returns:
            True when the module was created, False when creation raised.
        """
        try:
            self.markov_module = create_markov_spline_bitpipe_module(config)
            print(f"✅ Initialized MarkovSpline module: {self.markov_module.module_name}")
            return True
        except Exception as e:
            print(f"❌ Failed to initialize MarkovSpline: {e}")
            return False

    def preprocess_text_data(self,
                             input_file: str,
                             output_file: str,
                             smoothing_strength: float = 0.15,
                             chunk_size: int = 128) -> bool:
        """Preprocess text data using MarkovSpline for BitTransformerLM training.

        Reads ``input_file`` as UTF-8, converts every non-blank line to a
        ``chunk_size``-bit sequence, sends the sequences through the
        module's ``'preprocess_training'`` operation and writes the result
        to ``output_file`` as JSON.

        Args:
            input_file: Text file with one sample per line.
            output_file: Destination JSON file.
            smoothing_strength: Forwarded to the ``'data_preprocessor'``
                application.
            chunk_size: Bit length of each converted sample.

        Returns:
            True on success; False when the module is not initialized, the
            module reports failure, or any exception occurs.
        """
        if not self.markov_module:
            print("❌ MarkovSpline module not initialized")
            return False

        try:
            # Read input text
            with open(input_file, 'r', encoding='utf-8') as f:
                text_data = f.read().strip().split('\n')

            print(f"📖 Processing {len(text_data)} text samples...")

            # Convert text to bit sequences, skipping blank lines
            bit_sequences = [
                self.text_converter.text_to_bits(text, max_length=chunk_size)
                for text in text_data
                if text.strip()
            ]

            print(f"🔄 Converting to bit sequences: {len(bit_sequences)} sequences")

            # Initialize MarkovSpline preprocessor
            self.markov_module.initialize_application('data_preprocessor',
                                                      smoothing_strength=smoothing_strength,
                                                      preserve_features=True)

            # Process bit sequences through MarkovSpline
            result = self.markov_module.process_data(
                bit_sequences,
                'preprocess_training',
                binary_data=True
            )

            if not result['success']:
                print(f"❌ Processing failed: {result.get('error', 'Unknown error')}")
                return False

            # Save processed sequences
            processed_data = {
                'processed_sequences': result['processed_sequences'],
                'preprocessing_summary': result['preprocessing_summary'],
                'original_count': len(bit_sequences),
                'smoothing_strength': smoothing_strength,
                'chunk_size': chunk_size
            }

            # _json_default keeps array-like results as JSON lists; a bare
            # default=str would store their printed representation instead.
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, indent=2, default=self._json_default)

            print(f"✅ Preprocessed data saved to: {output_file}")
            print(f"📊 Summary: {result['preprocessing_summary']}")
            return True

        except Exception as e:
            print(f"❌ Preprocessing failed: {e}")
            return False

    def smooth_bit_sequence(self,
                            bit_sequence: List[int],
                            smoothing_type: str = 'predict_binary',
                            num_predictions: int = 10) -> Dict[str, Any]:
        """Smooth/predict bit sequence using MarkovSpline.

        Args:
            bit_sequence: Bits handed to the module's ``process_data``.
            smoothing_type: Operation name passed to ``process_data``.
            num_predictions: Forwarded as the ``num_predictions`` keyword.

        Returns:
            Whatever ``process_data`` returns, or a dict of the form
            ``{'success': False, 'error': <message>}`` when the module is
            not initialized or the call raises.
        """
        if not self.markov_module:
            print("❌ MarkovSpline module not initialized")
            return {'success': False, 'error': 'Module not initialized'}

        try:
            return self.markov_module.process_data(
                bit_sequence,
                smoothing_type,
                num_predictions=num_predictions
            )
        except Exception as e:
            print(f"❌ Bit sequence processing failed: {e}")
            return {'success': False, 'error': str(e)}

    def smooth_training_gradients(self,
                                  gradient_file: str,
                                  output_file: str,
                                  learning_rate: float = 0.01,
                                  smoothing_strength: float = 0.2) -> bool:
        """Apply MarkovSpline gradient smoothing to BitTransformerLM training.

        Loads ``gradient_file`` with ``torch.load``, requires the keys
        ``'gradients'`` and ``'parameters'``, runs the module's
        ``'smooth_gradients'`` operation and saves the outcome with
        ``torch.save``.

        Args:
            gradient_file: Checkpoint to load.
            output_file: Destination for the smoothed checkpoint.
            learning_rate: Forwarded to the ``'gradient_smoother'`` application.
            smoothing_strength: Forwarded to the ``'gradient_smoother'``
                application.

        Returns:
            True on success; False when the module is not initialized, the
            file lacks the required keys, the module reports failure, or any
            exception occurs.
        """
        if not self.markov_module:
            print("❌ MarkovSpline module not initialized")
            return False

        try:
            # Load gradient data (assuming PyTorch checkpoint format)
            # SECURITY NOTE(review): torch.load unpickles the file and can
            # execute arbitrary code from an untrusted checkpoint. Only load
            # trusted files, or consider weights_only=True if the checkpoint
            # contents allow it - TODO confirm.
            checkpoint = torch.load(gradient_file, map_location='cpu')

            if 'gradients' not in checkpoint or 'parameters' not in checkpoint:
                print("❌ Invalid gradient file format")
                return False

            # Initialize gradient smoother
            self.markov_module.initialize_application('gradient_smoother',
                                                      learning_rate=learning_rate,
                                                      smoothing_strength=smoothing_strength)

            # Process gradients
            result = self.markov_module.process_data(
                {
                    'parameters': checkpoint['parameters'],
                    'gradients': checkpoint['gradients']
                },
                'smooth_gradients'
            )

            if not result['success']:
                print(f"❌ Gradient smoothing failed: {result.get('error', 'Unknown error')}")
                return False

            # Save smoothed parameters
            smoothed_checkpoint = {
                'smoothed_parameters': result['smoothed_parameters'],
                'optimization_metrics': result['optimization_metrics'],
                'original_gradients': checkpoint['gradients']
            }

            torch.save(smoothed_checkpoint, output_file)
            print(f"✅ Smoothed gradients saved to: {output_file}")
            print(f"📊 Optimization metrics: {result['optimization_metrics']}")
            return True

        except Exception as e:
            print(f"❌ Gradient smoothing failed: {e}")
            return False

    def create_smoothed_dataset(self,
                                input_dataset: str,
                                output_dataset: str,
                                config: Optional[Dict] = None) -> bool:
        """Create smoothed dataset for BitTransformerLM training.

        Merges ``config`` over the built-in defaults, initializes the module
        with the merged configuration if it is not initialized yet, then
        delegates to ``preprocess_text_data`` using the merged
        ``'smoothing_strength'``.

        Args:
            input_dataset: Text file with one sample per line.
            output_dataset: Destination JSON file.
            config: Optional overrides for the default configuration.

        Returns:
            True on success, False otherwise.
        """
        # Default configuration for dataset smoothing
        default_config = {
            'smoothing_strength': 0.1,
            'num_states': 20,
            'spline_type': 'cubic',
            'preserve_features': True
        }

        if config:
            default_config.update(config)

        # NOTE(review): when the module is already initialized, the merged
        # configuration is not applied to it; only 'smoothing_strength' is
        # used below.
        if not self.markov_module:
            # Stop here on failure: the initializer has already reported the
            # cause, so continuing would only add a misleading second error.
            if not self.initialize_markov_spline(default_config):
                return False

        return self.preprocess_text_data(input_dataset, output_dataset,
                                         default_config['smoothing_strength'])
220
+
221
+
222
def _build_arg_parser():
    """Build the argument parser for the MarkovSpline CLI."""
    parser = argparse.ArgumentParser(description='MarkovSpline CLI for BitTransformerLM')
    parser.add_argument('command',
                        choices=['preprocess', 'smooth-gradients', 'create-dataset', 'predict-bits'],
                        help='Command to execute')

    # Common arguments
    parser.add_argument('--input', '-i', required=True, help='Input file path')
    parser.add_argument('--output', '-o', required=True, help='Output file path')
    parser.add_argument('--config', '-c', help='Configuration JSON file')

    # Preprocessing arguments
    parser.add_argument('--smoothing-strength', type=float, default=0.15,
                        help='Smoothing strength (0.0-1.0)')
    parser.add_argument('--chunk-size', type=int, default=128,
                        help='Text chunk size for bit conversion')

    # Gradient smoothing arguments
    parser.add_argument('--learning-rate', type=float, default=0.01,
                        help='Learning rate for gradient smoothing')

    # Bit prediction arguments
    parser.add_argument('--num-predictions', type=int, default=10,
                        help='Number of bit predictions to generate')
    return parser


def main(argv=None):
    """Run the MarkovSpline CLI and return a process exit code.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected; passing a list allows programmatic invocation.

    Returns:
        0 when the selected command succeeds, 1 otherwise. Invalid arguments
        make argparse raise ``SystemExit``.
    """
    args = _build_arg_parser().parse_args(argv)

    # Load configuration if provided
    config = None
    if args.config:
        try:
            with open(args.config, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"❌ Failed to load config: {e}")
            return 1

    # Initialize CLI
    cli = MarkovSplineBitTransformerCLI()
    if not cli.initialize_markov_spline(config):
        return 1

    # Execute command
    success = False

    if args.command == 'preprocess':
        success = cli.preprocess_text_data(
            args.input, args.output,
            args.smoothing_strength, args.chunk_size
        )

    elif args.command == 'smooth-gradients':
        success = cli.smooth_training_gradients(
            args.input, args.output,
            args.learning_rate, args.smoothing_strength
        )

    elif args.command == 'create-dataset':
        success = cli.create_smoothed_dataset(
            args.input, args.output, config
        )

    elif args.command == 'predict-bits':
        # Read bit sequence from input file; the JSON document is expected
        # to be an object with a 'bits' entry.
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                bit_data = json.load(f)
            bit_sequence = bit_data.get('bits', [])

            result = cli.smooth_bit_sequence(bit_sequence, 'predict_binary', args.num_predictions)

            if result['success']:
                # default=str stringifies any value json cannot encode natively.
                with open(args.output, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, default=str)
                print(f"✅ Bit predictions saved to: {args.output}")
                success = True
            else:
                print(f"❌ Bit prediction failed: {result.get('error', 'Unknown error')}")

        except Exception as e:
            # Top-level CLI boundary: report the failure and exit non-zero.
            print(f"❌ Bit prediction failed: {e}")

    return 0 if success else 1
304
+
305
+
306
+ if __name__ == '__main__':
307
+ sys.exit(main())