MS Word Document Processing
import os
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling.document_converter import DocumentConverter
def process_word_document(input_file: str, output_dir: str = "output") -> None:
    """Convert a Word document and write its content and tables to disk.

    The document is converted with docling's ``MsWordDocumentBackend``. The
    rendered content is written to ``<stem>_content.txt`` and, when the
    document contains tables, each table is written to ``<stem>_tables.txt``.
    A short structure summary is printed to stdout.

    Args:
        input_file: Path to the input Word document.
        output_dir: Directory to save processed outputs; created if missing.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist. Raised before
            the output directory is created.

    Note:
        Errors raised during conversion or while writing outputs are
        reported on stdout and not re-raised (best-effort behaviour).
    """
    in_path = Path(input_file)
    # Validate the input first so that a bad path does not leave an empty
    # output directory behind.
    if not in_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_file}")

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Initialize input document
        in_doc = InputDocument(
            path_or_stream=in_path,
            format=InputFormat.DOCX,
            backend=MsWordDocumentBackend,
        )
        # Setup backend and convert; the result is a DoclingDocument.
        backend = MsWordDocumentBackend(in_doc=in_doc, path_or_stream=in_path)
        doc = backend.convert()

        # Render before opening the file so a rendering failure does not
        # leave an empty content file on disk.
        content = doc.export_to_markdown()
        text_output = out_dir / f"{in_path.stem}_content.txt"
        text_output.write_text(content, encoding="utf-8")

        # DoclingDocument keeps text items in a flat ``texts`` list, each
        # tagged with a label; structure counts are derived from the labels.
        # Labels are string-valued enum members, so normalise to plain str.
        labelled = [
            (str(getattr(item.label, "value", item.label)), item)
            for item in doc.texts
        ]
        # Prefer an explicit title item; fall back to the document name.
        title = next(
            (item.text for label, item in labelled if label == "title"),
            doc.name,
        )
        section_count = sum(
            label == "section_header" for label, _ in labelled
        )
        paragraph_count = sum(
            label in ("paragraph", "text") for label, _ in labelled
        )

        print("\nDocument Structure:")
        print("-" * 20)
        print(f"Title: {title}")
        print(f"Sections: {section_count}")
        print(f"Paragraphs: {paragraph_count}")

        # Process tables if any
        if doc.tables:
            print(f"\nTables found: {len(doc.tables)}")
            tables_output = out_dir / f"{in_path.stem}_tables.txt"
            with open(tables_output, "w", encoding="utf-8") as f:
                for i, table in enumerate(doc.tables, 1):
                    f.write(f"\nTable {i}:\n")
                    f.write(str(table))
                    f.write("\n" + "-" * 50 + "\n")

        print(f"\nProcessing complete! Outputs saved in: {output_dir}")
    except Exception as e:
        # Deliberately best-effort: report the failure instead of crashing.
        print(f"Error processing document: {str(e)}")
def main():
    """Run the processor on the example document, using the default output directory."""
    # Example usage
    process_word_document("word_sample.docx")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Word Document Processor
Key Features
- Creates a reusable function process_word_document() that handles Word document processing
- Includes proper error handling and file checks
- Creates an output directory for processed files
- Extracts and saves:
- Document text content
- Document structure information (printed to the console rather than saved to a file)
- Tables (if present)
- Provides informative console output about the document structure
Usage Instructions
Setup Steps
- Save the script above to a file (e.g., word_processor.py)
- Make sure you have the docling library installed: pip install docling
- Place your Word document in the same directory (or provide the full path)
- Run the script: python word_processor.py
Output Files
The script creates an output directory containing:
- {filename}_content.txt: The extracted text content
- {filename}_tables.txt: Extracted tables (if any exist in the document)
Future Enhancements
- Support for other document formats
- More detailed content extraction
- Custom output formatting options
- Batch processing of multiple documents