Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	| # import re | |
| # import os | |
| # from xml.etree import ElementTree as ET | |
| # from xml.dom import minidom | |
| # def process_dorian_grey(): | |
| # # Create processed directory if it doesn't exist | |
| # os.makedirs('texts/processed', exist_ok=True) | |
| # # Read the file | |
| # with open('texts/dorian_grey.txt', 'r', encoding='utf-8') as f: | |
| # text = f.read() | |
| # # Create root XML element | |
| # root = ET.Element("book") | |
| # root.set("title", "The Picture of Dorian Gray") | |
| # # Split into chapters using regex | |
| # # Look for chapter markers and keep them with the content | |
| # chapter_pattern = r'(CHAPTER [IVXLC\d]+\..*?)(?=CHAPTER [IVXLC\d]+\.|$)' | |
| # chapters = re.findall(chapter_pattern, text, re.DOTALL) | |
| # # Process chapters | |
| # for i, content in enumerate(chapters): | |
| # # Create chapter element | |
| # chapter = ET.SubElement(root, "chapter") | |
| # chapter.set("id", f"chapter_{i}") | |
| # chapter.set("title", f"Chapter {i}") | |
| # chapter.text = content.strip() | |
| # # Pretty print XML | |
| # xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") | |
| # # Save as XML | |
| # output_path = 'texts/processed/dorian_grey.xml' | |
| # with open(output_path, 'w', encoding='utf-8') as f: | |
| # f.write(xml_str) | |
| # print(f"Processed and saved to {output_path}") | |
| # def process_time_machine(): | |
| # # Create processed directory if it doesn't exist | |
| # os.makedirs('texts/processed', exist_ok=True) | |
| # # Read the file | |
| # with open('texts/time_machine.txt', 'r', encoding='utf-8') as f: | |
| # text = f.read() | |
| # # Create root XML element | |
| # root = ET.Element("book") | |
| # root.set("title", "The Time Machine") | |
| # # Split into chapters using 4 or more newlines as separator | |
| # chapters = re.split(r'\n{4,}', text) | |
| # # Track actual chapter number (no skipping) | |
| # chapter_num = 1 | |
| # # Process chapters | |
| # for content in chapters: | |
| # if content.strip(): # Only process non-empty chapters | |
| # # Create chapter element | |
| # chapter = ET.SubElement(root, "chapter") | |
| # chapter.set("id", f"chapter_{chapter_num-1}") # Keep 0-based ids | |
| # chapter.set("title", f"Chapter {chapter_num}") | |
| # chapter.text = content.strip() | |
| # chapter_num += 1 | |
| # # Pretty print XML | |
| # xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") | |
| # # Save as XML | |
| # output_path = 'texts/processed/time_machine.xml' | |
| # with open(output_path, 'w', encoding='utf-8') as f: | |
| # f.write(xml_str) | |
| # print(f"Processed and saved to {output_path}") | |
| # if __name__ == "__main__": | |
| # process_time_machine() | |