You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

20 lines
616 B

import fitz
# Extract the text of every page of the thesis PDF that yields non-blank text
# and write it to a single UTF-8 text file, with a "=== PAGE n ===" marker
# line in front of each page's text. Summary counts are printed to stdout.
output = []
# Open the PDF as a context manager so the document handle is released even
# if text extraction raises part-way through.
with fitz.open(r'C:\git\spark-lesson\reference\sources\liu-discharge-transitions-thesis.pdf') as doc:
    print(f'Total pages: {len(doc)}')
    # Page markers are 1-based, hence start=1.
    for page_number, page in enumerate(doc, start=1):
        text = page.get_text()
        # Skip pages whose extracted text is empty or whitespace-only.
        if text.strip():
            output.append(f'=== PAGE {page_number} ===')
            output.append(text)
full_text = '\n'.join(output)
with open(r'C:\git\spark-lesson\reference\sources\liu-discharge-transitions-thesis.txt', 'w', encoding='utf-8') as f:
    f.write(full_text)
# Every kept page contributed exactly two entries (marker + text), so half
# the list length is the number of pages written.
print(f'Extracted {len(output)//2} pages')
print(f'Total characters: {len(full_text)}')
# chr(10) is the newline character; this counts newlines in the joined text.
print(f'Total lines: {full_text.count(chr(10))}')