You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

27 lines
799 B

"""Extract text content from PDF to plain text file."""
import sys
import fitz
src = r'C:\git\spark-lesson\reference\sources\non-equilibrium-air-plasmas-becker-kogelschatz.pdf'
dst = r'C:\git\spark-lesson\reference\sources\non-equilibrium-air-plasmas-becker-kogelschatz.txt'


def collect_page_text(pages):
    """Return the text of every non-blank page joined into one string.

    Args:
        pages: Iterable of objects exposing ``get_text()`` (for example an
            open PyMuPDF document, which iterates over its pages).

    Returns:
        A single string in which each page whose text is not purely
        whitespace appears as ``--- Page N ---`` followed by that page's
        text. ``N`` is the 1-based position of the page in ``pages``, so
        skipped blank pages leave gaps in the numbering. Sections are
        separated by a newline; the result is ``''`` when no page has text.
    """
    extracted = (page.get_text() for page in pages)
    sections = [
        f'--- Page {number} ---\n{content}'
        for number, content in enumerate(extracted, start=1)
        if content.strip()
    ]
    return '\n'.join(sections)


def main(src_path=src, dst_path=dst):
    """Extract the text of the PDF at ``src_path`` into ``dst_path``.

    Prints the page count, title, author and size statistics to stdout and
    writes the extracted text as UTF-8.

    Args:
        src_path: Path of the PDF to read. Defaults to the module-level ``src``.
        dst_path: Path of the text file to write. Defaults to the
            module-level ``dst``.
    """
    # The context manager closes the document even if extraction raises,
    # which a trailing doc.close() would not.
    with fitz.open(src_path) as doc:
        print(f'Pages: {len(doc)}')
        metadata = doc.metadata or {}
        # `or` also covers a key that is present but empty/None, which the
        # default argument of .get() alone would not replace.
        print(f'Title: {metadata.get("title") or "N/A"}')
        print(f'Author: {metadata.get("author") or "N/A"}')
        full = collect_page_text(doc)

    print(f'Total chars: {len(full):,}')
    print(f'Estimated size: {len(full.encode("utf-8"))/1024/1024:.1f} MB')
    with open(dst_path, 'w', encoding='utf-8') as f:
        f.write(full)
    print(f'Written to {dst_path}')


if __name__ == '__main__':
    main()