Created AltTextEPUB class and implemented methods from AltText abstract class.

pull/5/head
XxMistaCruzxX 2023-10-26 14:40:06 -04:00
parent d82184ceed
commit 7daaf8e206
2 changed files with 108 additions and 14 deletions

View File

@ -1,8 +1,12 @@
from abc import ABC, abstractmethod
import typing
import warnings
import bs4
import ebooklib
from ebooklib import epub
class AltText(ABC):
# PARSING METHODS
@abstractmethod
def checkData(self):
pass
@ -28,17 +32,27 @@ class AltText(ABC):
def exportToFile(self, path:str):
pass
def getSoup(content: typing.Union[str, bytes], parser: str = "html.parser") -> bs4.BeautifulSoup:
    """Parse markup into a BeautifulSoup tree, falling back to XML on failure.

    Args:
        content: The markup to parse.
        parser: Name of the BeautifulSoup parser tried first. Defaults to
            "html.parser", which is what this function always used; the
            parameter exists so callers may pass the parser explicitly.

    Returns:
        The parsed document.

    Raises:
        Exception: If both the requested parser and the XML parser fail. The
            message carries both underlying errors.
    """
    try:
        return bs4.BeautifulSoup(content, parser)
    except Exception as htmlErr:
        # Second attempt: treat the document as XML before giving up.
        try:
            return bs4.BeautifulSoup(content, features="xml")
        except Exception as xmlErr:
            # Chain the XML failure so the traceback keeps the root cause.
            raise Exception(f"Failed to parse the document as HTML: {htmlErr}\nFailed to parse the document as XML: {xmlErr}") from xmlErr
class AltTextHTML(AltText):
def __init__(self):
def __init__(self) -> None:
return None
def checkData(self):
def checkData(self) -> bool:
if not hasattr(self, "data"):
raise Exception("no data set. please use .parse or .parseFile")
return True
def parse(self, html:str) -> bs4.BeautifulSoup:
soup = bs4.BeautifulSoup(html, "html.parser")
soup = getSoup(html, "html.parser")
self.data = soup
return soup
@ -52,7 +66,6 @@ class AltTextHTML(AltText):
return imgs
def getNoAltImgs(self) -> typing.List[bs4.element.Tag]:
self.checkData()
imgs = self.getAllImgs()
noalt = []
for img in imgs:
@ -76,3 +89,61 @@ class AltTextHTML(AltText):
with open(path, 'w', encoding='utf-8') as file:
file.write(html)
return path
class AltTextEPUB(AltText):
    """AltText implementation for EPUB books read through ebooklib.

    The loaded book is stored on ``self.data`` by ``parseFile``. The image
    and export methods call ``checkData`` first, so they raise a descriptive
    error when no book has been loaded.
    """

    def __init__(self) -> None:
        return None

    def checkData(self) -> bool:
        """Return True when a book has been loaded.

        Raises:
            Exception: If ``self.data`` has not been set yet.
        """
        if not hasattr(self, "data"):
            raise Exception("no data set. please use .parse or .parseFile")
        return True

    def parse(self, html: str):
        """Not implemented for EPUB input; always raises.

        Raises:
            NotImplementedError: Always. It is a subclass of Exception, so
                callers that caught the previous bare Exception still do.
        """
        raise NotImplementedError("parse: IMPLEMENT ME")

    def parseFile(self, filename: str) -> epub.EpubBook:
        """Read the EPUB at ``filename``, store it on ``self.data`` and return it."""
        book = epub.read_epub(filename, {"ignore_ncx": True})
        self.data = book
        return book

    def getAllImgs(self) -> typing.List[bs4.element.Tag]:
        """Return every ``<img>`` tag found in the book's document items.

        Raises:
            Exception: If no book has been loaded.
        """
        # Same guard as setAlt/export, instead of an AttributeError on self.data.
        self.checkData()
        imgs = []
        for doc in self.data.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            imgs.extend(getSoup(doc.get_content()).find_all("img"))
        return imgs

    def getNoAltImgs(self) -> typing.List[bs4.element.Tag]:
        """Return the ``<img>`` tags whose ``alt`` is missing or blank."""
        return [
            img
            for img in self.getAllImgs()
            if "alt" not in img.attrs or img.attrs["alt"].strip() == ""
        ]

    def setAlt(self, src: str, text: str):
        """Set the ``alt`` text of the first image whose ``src`` equals ``src``.

        The document item that contains the image is re-serialised and
        written back into the book.

        Raises:
            Exception: If no book has been loaded, or no image has that src.
        """
        self.checkData()
        for doc in self.data.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            soup = getSoup(doc.get_content())
            for img in soup.find_all("img"):
                # An <img> without a src attribute is skipped; indexing
                # attrs["src"] directly raised KeyError and aborted the search.
                if "src" in img.attrs and img.attrs["src"] == src:
                    img.attrs["alt"] = text
                    doc.set_content(soup.prettify().encode("utf-8"))
                    return
        raise Exception(f"unable to find image with src '{src}'")

    def export(self):
        """Return the loaded book object.

        Raises:
            Exception: If no book has been loaded.
        """
        self.checkData()
        return self.data

    def exportToFile(self, path: str) -> str:
        """Write the loaded book to ``path`` as an EPUB and return ``path``."""
        epub.write_epub(path, self.export())
        return path

View File

@ -1,21 +1,44 @@
import sys
sys.path.append("../")
import src.alttext_FEF.alttext as alttext
import src.alttext.alttext as alttext
import ebooklib
if __name__ == "__main__":
HTML1 = "../books/pg71856-h/pg71856-images.html"
HTML2 = "../books/pg71859-h/pg71859-images.html"
EPUB1 = "../books/pg71856-images-3.epub"
EPUB2 = "../books/pg71908-images-3.epub"
EPUB3 = "../books/seuss.epub"
def testHTML():
## MAIN TEST
print("TESTING index.py")
alt:alttext.AltTextHTML = alttext.AltTextHTML()
print("TESTING HTML")
altHTML:alttext.AltTextHTML = alttext.AltTextHTML()
## PARSE TEST
html = '<html><head><title>Test</title></head><body><img src="test"/><h1>Parse me!</h1><img src="test2"/></body></html>'
alt.parse(html)
altHTML.parse(html)
## PARSEFILE TEST
path1 = "../books/pg71856-h/pg71856-images.html"
path2 = "../books/pg71859-h/pg71859-images.html"
alt.parseFile(path1)
altHTML.parseFile(HTML1)
## GETALLIMGS & SETALT TEST
imgs = alt.getNoAltImgs()
print(imgs)
imgs = altHTML.getNoAltImgs()
print(imgs)
def testEPUB():
    """Smoke test: load EPUB2 with AltTextEPUB and print its images lacking alt text."""
    print("TESTING EPUB")
    reader: alttext.AltTextEPUB = alttext.AltTextEPUB()
    reader.parseFile(EPUB2)
    print(reader.getNoAltImgs())
    # Disabled manual check of setAlt, kept for reference:
    #   take the src of the first image printed above, call
    #   reader.setAlt(src, "TEST ALT"), then print reader.getAllImgs().
if __name__ == "__main__":
    # Only the EPUB smoke test runs; the HTML one is switched off.
    # testHTML()
    testEPUB()