I need to extract the text from an EPUB file.
# The asker's original attempt, using the third-party epub_conversion package.
from epub_conversion.utils import open_book, convert_epub_to_lines
# NOTE(review): this handle is opened in append mode but is never written to
# or closed anywhere in the snippet shown.
f = open("demofile.txt", "a")
book = open_book("razvansividra.epub")
# Presumably returns the book's text as a sequence of lines — verify against
# the library before relying on it.
lines = convert_epub_to_lines(book)
I use this, but when I call print(lines) it prints only one line. The library is also six years old. Does anyone know a better way?
What about https://github.com/aerkalov/ebooklib?
EbookLib is a Python library for managing EPUB2/EPUB3 and Kindle files. It's capable of reading and writing EPUB files programmatically (Kindle support is under development).
The API is designed to be as simple as possible, while at the same time making complex things possible too. It has support for covers, table of contents, spine, guide, metadata, and more.
import ebooklib
from ebooklib import epub

# Load the EPUB from disk.
book = epub.read_epub('test.epub')

# Visit every item in the book whose type is ITEM_DOCUMENT.
for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
    # print() must be called as a function: the Python 2 statement form
    # `print doc` is a SyntaxError on Python 3.
    print(doc)
Here is a sloppy script that extracts the text from an .epub in the right order. Improvements could be made.
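Quick explanation: an .epub is a zip archive. The script extracts it to a temporary directory, reads META-INF/container.xml to locate the package file, walks that file's spine to get the reading order, looks up each spine entry in the manifest to find the corresponding XHTML file, and writes the text of every element under <body> to the output file.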
Dependency: lxml
#!/usr/bin/python3
import os
import shutil
import sys
import tempfile
import zipfile
from urllib.parse import unquote

from lxml import etree
# Command-line script: extract the plain text of an EPUB, in reading (spine)
# order, and write it to a text file.
#
# Usage: <script> <input.epub> <output.txt>
if len(sys.argv) != 3:
    print(f"Usage: {sys.argv[0]} <input.epub> <output.txt>")
    # sys.exit() is always available; the bare exit() helper is only
    # installed by the site module and is meant for interactive sessions.
    sys.exit(1)

input_file_path = sys.argv[1]
output_file_path = sys.argv[2]

print(f"Input: {input_file_path}")
print(f"Output: {output_file_path}")

with tempfile.TemporaryDirectory() as tmp_dir:
    print(f"Extracting input to temp directory '{tmp_dir}'.")
    with zipfile.ZipFile(input_file_path, "r") as zip_ref:
        zip_ref.extractall(tmp_dir)

    # Explicit UTF-8 so that non-ASCII book text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        print("Parsing 'container.xml' file.")
        container_file_path = f"{tmp_dir}/META-INF/container.xml"
        container_tree = etree.parse(container_file_path)

        # local-name() is used throughout so the queries match regardless
        # of the namespace prefix the document uses.
        for root_file_path in container_tree.xpath(
            "//*[local-name()='container']"
            "/*[local-name()='rootfiles']"
            "/*[local-name()='rootfile']"
            "/@full-path"
        ):
            print(f"Parsing '{root_file_path}' file.")
            content_file_path = f"{tmp_dir}/{root_file_path}"
            # Manifest hrefs are resolved relative to the package file.
            content_dir_path = os.path.dirname(content_file_path)
            package_tree = etree.parse(content_file_path)

            # The spine lists item ids in reading order.
            for idref in package_tree.xpath(
                "//*[local-name()='package']"
                "/*[local-name()='spine']"
                "/*[local-name()='itemref']"
                "/@idref"
            ):
                # Pass the id as an XPath variable rather than formatting it
                # into the expression, so ids containing quote characters
                # cannot break the query.
                hrefs = package_tree.xpath(
                    "//*[local-name()='package']"
                    "/*[local-name()='manifest']"
                    "/*[local-name()='item'][@id=$item_id]"
                    "/@href",
                    item_id=str(idref),
                )
                for href in hrefs:
                    out_file.write("\n")
                    # hrefs may be percent-encoded (e.g. "chapter%201.xhtml");
                    # decode them to get the real file name on disk.
                    xhtml_file_path = f"{content_dir_path}/{unquote(href)}"
                    subtree = etree.parse(xhtml_file_path, etree.HTMLParser())
                    # One output line per direct child of <body>.
                    for element in subtree.xpath("//html/body/*"):
                        out_file.write("".join(element.itertext()))
                        out_file.write("\n")

print(f"Text written to '{output_file_path}'.")
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!
Donate Us With