Created AltTextEPUB class and implemented methods from AltText abstract class.
parent
d82184ceed
commit
7daaf8e206
|
@ -1,8 +1,12 @@
|
|||
from abc import ABC, abstractmethod
|
||||
import typing
|
||||
import warnings
|
||||
import bs4
|
||||
import ebooklib
|
||||
from ebooklib import epub
|
||||
|
||||
class AltText(ABC):
|
||||
# PARSING METHODS
|
||||
@abstractmethod
|
||||
def checkData(self):
|
||||
pass
|
||||
|
@ -28,17 +32,27 @@ class AltText(ABC):
|
|||
def exportToFile(self, path:str):
|
||||
pass
|
||||
|
||||
def getSoup(content: str, parser: str = "html.parser") -> bs4.BeautifulSoup:
    """Parse markup into a BeautifulSoup tree, falling back to the XML parser.

    Args:
        content: Markup to parse.
        parser: Name of the parser to try first. Defaults to
            ``"html.parser"``. It is optional so that one-argument calls
            behave as before, while calls that pass the parser name as a
            second positional argument no longer fail with ``TypeError``.

    Returns:
        The parsed document.

    Raises:
        Exception: If both the first parser and the XML parser fail. The
            message carries both underlying errors.
    """
    try:
        return bs4.BeautifulSoup(content, parser)
    except Exception as htmlErr:
        try:
            return bs4.BeautifulSoup(content, features="xml")
        except Exception as xmlErr:
            # Chain the XML failure so the original traceback is preserved.
            raise Exception(
                f"Failed to parse the document as HTML: {htmlErr}\n"
                f"Failed to parse the document as XML: {xmlErr}"
            ) from xmlErr
|
||||
|
||||
|
||||
class AltTextHTML(AltText):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
return None
|
||||
|
||||
def checkData(self):
|
||||
def checkData(self) -> bool:
|
||||
if not hasattr(self, "data"):
|
||||
raise Exception("no data set. please use .parse or .parseFile")
|
||||
return True
|
||||
|
||||
def parse(self, html: str) -> bs4.BeautifulSoup:
    """Parse an HTML string and keep the resulting tree on the instance.

    Args:
        html: The HTML markup to parse.

    Returns:
        The parsed document, which is also stored as ``self.data``.
    """
    # getSoup already tries "html.parser" first, so only the markup is
    # passed; the document is parsed once instead of twice.
    soup = getSoup(html)
    self.data = soup
    return soup
|
||||
|
||||
|
@ -52,7 +66,6 @@ class AltTextHTML(AltText):
|
|||
return imgs
|
||||
|
||||
def getNoAltImgs(self) -> typing.List[bs4.element.Tag]:
|
||||
self.checkData()
|
||||
imgs = self.getAllImgs()
|
||||
noalt = []
|
||||
for img in imgs:
|
||||
|
@ -76,3 +89,61 @@ class AltTextHTML(AltText):
|
|||
with open(path, 'w', encoding='utf-8') as file:
|
||||
file.write(html)
|
||||
return path
|
||||
|
||||
class AltTextEPUB(AltText):
    """AltText implementation that works on an EPUB book read with ebooklib.

    ``self.data`` holds the ``epub.EpubBook`` returned by ``parseFile``; every
    method that reads it first calls ``checkData``.
    """

    def __init__(self) -> None:
        """Create an instance with no book loaded."""
        return None

    def checkData(self) -> bool:
        """Confirm that a book has been stored on ``self.data``.

        Returns:
            True when the ``data`` attribute exists.

        Raises:
            Exception: If no ``data`` attribute has been set yet.
        """
        if not hasattr(self, "data"):
            raise Exception("no data set. please use .parse or .parseFile")
        return True

    def parse(self, html: str):
        """Not implemented for EPUB input.

        Raises:
            NotImplementedError: Always. It is a subclass of ``Exception``,
                so callers catching ``Exception`` still catch it.
        """
        raise NotImplementedError("parse: IMPLEMENT ME")

    def parseFile(self, filename: str) -> epub.EpubBook:
        """Read an EPUB file and store the book on the instance.

        Args:
            filename: Path of the EPUB file to read.

        Returns:
            The loaded book, which is also stored as ``self.data``.
        """
        book = epub.read_epub(filename, {"ignore_ncx": True})
        self.data = book
        return book

    def getAllImgs(self) -> typing.List[bs4.element.Tag]:
        """Collect every ``<img>`` tag from all document items of the book.

        Returns:
            The image tags, in the order the documents are yielded.

        Raises:
            Exception: If no book has been loaded.
        """
        # Fail with the explicit "no data set" error rather than an
        # AttributeError on self.data.
        self.checkData()
        imgs = []
        for doc in self.data.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            imgs.extend(getSoup(doc.get_content()).find_all("img"))
        return imgs

    def getNoAltImgs(self) -> typing.List[bs4.element.Tag]:
        """Return the images whose ``alt`` attribute is missing or blank.

        Raises:
            Exception: If no book has been loaded.
        """
        return [
            img
            for img in self.getAllImgs()
            if not img.attrs.get("alt", "").strip()
        ]

    def setAlt(self, src: str, text: str):
        """Set the alt text of the first image whose ``src`` equals *src*.

        Only the first matching image is changed; the document containing it
        is re-serialised and written back to its item.

        Args:
            src: Exact value of the image's ``src`` attribute.
            text: Alt text to assign.

        Raises:
            Exception: If no book has been loaded, or no image has that src.
        """
        self.checkData()
        for doc in self.data.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            soup = getSoup(doc.get_content())
            for img in soup.find_all("img"):
                # An <img> without a src attribute must not abort the search.
                if "src" in img.attrs and img.attrs["src"] == src:
                    img.attrs["alt"] = text
                    # Serialise with str() rather than prettify(): prettify
                    # inserts whitespace and so alters the stored document.
                    doc.set_content(str(soup).encode("utf-8"))
                    return
        raise Exception("unable to find image with src '{src}'".format(src=src))

    def export(self):
        """Return the loaded book.

        Raises:
            Exception: If no book has been loaded.
        """
        self.checkData()
        return self.data

    def exportToFile(self, path: str) -> str:
        """Write the loaded book to *path* as an EPUB file.

        Args:
            path: Destination file path.

        Returns:
            The same *path*.

        Raises:
            Exception: If no book has been loaded.
        """
        epub.write_epub(path, self.export())
        return path
|
|
@ -1,21 +1,44 @@
|
|||
import sys
|
||||
sys.path.append("../")
|
||||
import src.alttext_FEF.alttext as alttext
|
||||
import src.alttext.alttext as alttext
|
||||
import ebooklib
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Sample HTML inputs used by testHTML (relative paths under ../books/).
HTML1 = "../books/pg71856-h/pg71856-images.html"
|
||||
HTML2 = "../books/pg71859-h/pg71859-images.html"
|
||||
|
||||
# Sample EPUB inputs; testEPUB currently reads EPUB2.
EPUB1 = "../books/pg71856-images-3.epub"
|
||||
EPUB2 = "../books/pg71908-images-3.epub"
|
||||
EPUB3 = "../books/seuss.epub"
|
||||
|
||||
def testHTML():
    """Smoke-test AltTextHTML.

    Parses an inline HTML literal, then the HTML1 sample file, and prints the
    images of the loaded file that have no alt text.
    """
    print("TESTING HTML")
    altHTML: alttext.AltTextHTML = alttext.AltTextHTML()

    ## PARSE TEST
    html = '<html><head><title>Test</title></head><body><img src="test"/><h1>Parse me!</h1><img src="test2"/></body></html>'
    altHTML.parse(html)

    ## PARSEFILE TEST
    altHTML.parseFile(HTML1)

    ## GETNOALTIMGS TEST
    imgs = altHTML.getNoAltImgs()
    print(imgs)
|
||||
|
||||
def testEPUB():
    """Smoke-test AltTextEPUB: load the EPUB2 sample and print the images
    that have no alt text."""
    print("TESTING EPUB")
    altEPUB: alttext.AltTextEPUB = alttext.AltTextEPUB()
    altEPUB.parseFile(EPUB2)

    missingAlt = altEPUB.getNoAltImgs()
    print(missingAlt)

    # Disabled setAlt round-trip check:
    # img = missingAlt[0]
    # print(img)
    # src = img.attrs["src"]
    # altEPUB.setAlt(src, "TEST ALT")
    # print(altEPUB.getAllImgs())
|
||||
|
||||
# Script entry point: only the EPUB smoke test is enabled; the HTML one is commented out.
if __name__ == "__main__":
|
||||
# testHTML()
|
||||
testEPUB()
|
Loading…
Reference in New Issue