still not working

pull/19/head
kethan351 2024-03-20 17:21:21 -04:00
parent bedb524546
commit 0d47c0c4f3
2 changed files with 55 additions and 18 deletions

View File

@ -1,15 +1,15 @@
from abc import ABC, abstractmethod
import typing
from threading import Thread
import time
import bs4
import ebooklib
from ebooklib import epub
from descengine import DescEngine
from ocrengine import OCREngine
from langengine import LangEngine
from .descengine.descengine import DescEngine
from .ocrengine.ocrengine import OCREngine
from .langengine.langengine import LangEngine
DEFOPTIONS = {
@ -84,7 +84,7 @@ class AltText(ABC):
# PARSING METHODS
@abstractmethod
def parse(self, data: str) -> bs4.BeautifulSoup | epub.EpubBook:
def parse(self, data: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]:
"""Parses data into a BeautifulSoup or EpubBook object.
Args:
@ -96,7 +96,7 @@ class AltText(ABC):
pass
@abstractmethod
def parseFile(self, filepath: str) -> bs4.BeautifulSoup | epub.EpubBook:
def parseFile(self, filepath: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]:
"""Parses data from a file into a BeautifulSoup or EpubBook object.
Args:
@ -163,7 +163,7 @@ class AltText(ABC):
pass
@abstractmethod
def export(self) -> str | epub.EpubBook:
def export(self) -> typing.Union[str, epub.EpubBook]:
"""Exports the current data.
Returns:

View File

@ -11,8 +11,8 @@ from bs4 import BeautifulSoup
import importlib
sys.path.append("c:/Users/ketha/Code/Senior D") #This will need to be changed system to system
AltTextHTML = importlib.import_module("alt-text.src.alttext.alttext").AltTextHTML
PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine").PrivateGPT
descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine.py'
PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine.langengine").PrivateGPT
descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine/descengine.py'
@ -24,6 +24,8 @@ descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine.
# Iterate through the downloaded_books folder and pass each HTML file into parseFile
class AltTextGenerator(AltTextHTML):
# uses the class from alttext.py
# adds relevant benchmarking and saving methods
@ -34,18 +36,27 @@ class AltTextGenerator(AltTextHTML):
self.api_key = api_key
#Use genAltTextV2
#ADD benchmark time stamps
def genAltTextV2(self, src: str) -> str:
def genAltTextV2(self, src: str, book_id, image_path, book_path) -> str:
# Start total timing
total_start_time = time.time()
with open('example.txt', 'w', encoding="utf-8") as file:
#contents = file.read()
file.write(str(src))
# Image data extraction timing
imgdata_start_time = time.time()
print("starting imaging")
time.sleep(3)
imgdata = self.getImgData(src)
imgdata_end_time = time.time()
imgdata_total_time = imgdata_end_time - imgdata_start_time
# Context extraction timing
context = [None, None]
print("starting contexting")
time.sleep(3)
context_start_time = time.time()
if self.options["withContext"]:
context = self.getContext(self.getImg(src))
@ -56,12 +67,16 @@ class AltTextGenerator(AltTextHTML):
# Description generation timing
genDesc_start_time = time.time()
print("starting desc")
time.sleep(3)
desc = self.genDesc(imgdata, src, context)
genDesc_end_time = time.time()
genDesc_total_time = genDesc_end_time - genDesc_start_time
# OCR processing timing
ocr_start_time = time.time()
print("starting ocr")
time.sleep(3)
chars = ""
if self.ocrEngine is not None:
chars = self.genChars(imgdata, src).strip()
@ -70,6 +85,8 @@ class AltTextGenerator(AltTextHTML):
# Refinement processing timing
refine_start_time = time.time()
print("starting refinement")
time.sleep(3)
if self.langEngine is None:
raise Exception("To use version 2, you must have a langEngine set.")
refined_desc = self.langEngine.refineAlt(desc, chars, context, None)
@ -80,14 +97,26 @@ class AltTextGenerator(AltTextHTML):
total_end_time = time.time()
total_overall_time = total_end_time - total_start_time
#Record dictionary to store all the timing data
record = {
"Image Data Extraction Time": imgdata_total_time,
"Context Extraction Time": context_total_time,
"Description Generation Time": genDesc_total_time,
"OCR Processing Time": ocr_total_time,
"Refinement Processing Time": refine_total_time,
"Total Overall Time": total_overall_time
"Book": book_id,
"Image": image_path,
"Path": book_path,
"Status": True, #Set false if failed, set true is worked
"Before Context": beforeContext,
"After Context": afterContext,
"genDesc": desc,
"genDesc-Start": genDesc_start_time,
"genDesc-End": genDesc_end_time,
"genDesc-Time": genDesc_total_time,
"genOCR": chars,
"genOCR-Start": ocr_start_time,
"genOCR-End": ocr_end_time,
"genOCR-Time": ocr_total_time,
"refineDesc": refined_desc,
"refineDesc-Time": refine_total_time,
"Total Time": total_overall_time
}
# Add record to benchmark_records for later CSV generation
self.benchmark_records.append(record)
@ -132,6 +161,7 @@ def automate_process(extr_folder : str):
if os.path.exists(extr_folder):
for book_id in os.listdir(extr_folder):
book_path = os.path.join(extr_folder, book_id)
#alt-text/tests/downloaded_books\120
if os.path.isdir(book_path):
for filename in os.listdir(book_path):
filepath = os.path.join(book_path, filename)
@ -139,13 +169,20 @@ def automate_process(extr_folder : str):
# Check if the file is an HTML file
if filepath.endswith(".html"):
# TODO: add an extra layer here that iterates through the images tab,
# finds each image within the .html file,
# and then goes to alt-text generation, where it will:
#   - get the context
#   - generate the alt-text for that image based on the context and other factors
# Use the parseFile method to parse the HTML file for the genAltText function
soup = generator.parseFile(filepath)
generator.genAltText(soup)
generator.genAltTextV2(soup, book_id, filepath, book_path)
generator.generate_csv('test_benchmark.csv', generator.benchmark_records)
if __name__ == "__main__":
print("Running automate.py")
automate_process('downloaded_books')
automate_process('alt-text/tests/downloaded_books')