Integrated current alt-text package structure into benchmarking. Completed benchmarking functionality.

2024-03-21 22:52:53 -04:00 · 2024-03-21 22:52:53 -04:00 · 15d5275599
parent 0102c8ab84 bdac53a646
commit 15d5275599
9 changed files with 253 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,5 +4,10 @@
 /books
 /tests/outputs

+**/empty_alt_text.txt
+**/empty_alt_text_sample.txt
+**/book_outputs
+**/downloaded_books
+
 **/keys.py
 **/vertex-key.json
--- a/src/alttext/alttext.py
+++ b/src/alttext/alttext.py
@ -83,7 +83,7 @@ class AltText(ABC):

    # PARSING METHODS
    @abstractmethod
-    def parse(self, data: str) -> bs4.BeautifulSoup | epub.EpubBook:
+    def parse(self, data: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]:
        """Parses data into a BeautifulSoup or EpubBook object.

        Args:
@ -95,7 +95,9 @@ class AltText(ABC):
        pass

    @abstractmethod
-    def parseFile(self, filepath: str) -> bs4.BeautifulSoup | epub.EpubBook:
+    def parseFile(
+        self, filepath: str
+    ) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]:
        """Parses data from a file into a BeautifulSoup or EpubBook object.

        Args:
@ -162,7 +164,7 @@ class AltText(ABC):
        pass

    @abstractmethod
-    def export(self) -> str | epub.EpubBook:
+    def export(self) -> typing.Union[str, epub.EpubBook]:
        """Exports the current data.

        Returns:
--- a/src/alttext/descengine/descengine.py
+++ b/src/alttext/descengine/descengine.py
@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod

+
 ### DESCENGINE CLASSES
 class DescEngine(ABC):
    @abstractmethod
--- a/src/alttext/langengine/langengine.py
+++ b/src/alttext/langengine/langengine.py
@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod

+
 class LangEngine(ABC):
    @abstractmethod
    def _completion(self, prompt: str) -> str:
@ -99,4 +100,4 @@ class LangEngine(ABC):
        Returns:
            bool: True if successful.
        """
-        pass
+        pass
--- a/src/alttext/langengine/privategpt.py
+++ b/src/alttext/langengine/privategpt.py
@ -2,6 +2,7 @@ import requests

 from .langengine import LangEngine

+
 class PrivateGPT(LangEngine):
    def __init__(self, host) -> None:
        self.host = host
--- a/src/alttext/ocrengine/ocrengine.py
+++ b/src/alttext/ocrengine/ocrengine.py
@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod

+
 class OCREngine(ABC):
    @abstractmethod
    def genChars(self, imgData: bytes, src: str, context: str = None) -> str:
@ -13,4 +14,4 @@ class OCREngine(ABC):
        Returns:
            str: Characters found in an image.
        """
-        pass
+        pass
--- a/tests/automate.py
+++ b/tests/automate.py
@ -0,0 +1,132 @@
+# automate.py - tests the generation of alt text for images and benchmarks the systems
+# run getbooks.py then downloadbooks.py with input (.txt file), use output for next steps
+
+import os
+import sys
+import time
+import csv
+
+import keys
+
+sys.path.append("../")
+from src.alttext.alttext import AltTextHTML
+from src.alttext.descengine.replicateapi import ReplicateAPI
+from src.alttext.ocrengine.tesseract import Tesseract
+from src.alttext.langengine.openaiapi import OpenAIAPI
+
+
+class AltTextGenerator(AltTextHTML):
+    """AltTextHTML subclass whose genAltTextV2 returns a benchmark record."""
+
+    def genAltTextV2(self, src: str, book_id, image_path, book_path) -> dict:
+        """Generate alt-text for one image and time each pipeline stage.
+
+        Args:
+            src: Image source passed to the image, description and OCR steps.
+            book_id, image_path, book_path: Copied verbatim into the record.
+
+        Returns:
+            dict: Generated texts plus time.time() stamps and durations.
+        """
+        print(f"PROCESSING BOOK {book_id} | IMAGE {image_path}")
+        total_start_time = time.time()
+        imgdata = self.getImgData(src)
+        context = self.getContext(self.getImg(src))
+
+        # Description generation timing
+        genDesc_start_time = time.time()
+        desc = self.genDesc(imgdata, src, context)
+        genDesc_end_time = time.time()
+        genDesc_total_time = genDesc_end_time - genDesc_start_time
+
+        # OCR processing timing
+        ocr_start_time = time.time()
+        chars = self.genChars(imgdata, src).strip()
+        ocr_end_time = time.time()
+        ocr_total_time = ocr_end_time - ocr_start_time
+
+        # Refinement processing timing
+        refine_start_time = time.time()
+        refined_desc = self.langEngine.refineAlt(desc, chars, context, None)
+        refine_end_time = time.time()
+        refine_total_time = refine_end_time - refine_start_time
+
+        # End total timing
+        total_end_time = time.time()
+        total_overall_time = total_end_time - total_start_time
+
+        record = {
+            "book": book_id,
+            "image": image_path,
+            "path": book_path,
+            "status": True,  # reached only when no stage raised; failures propagate
+            "beforeContext": context[0],
+            "afterContext": context[1],
+            "genDesc": desc,
+            "genDesc-Start": genDesc_start_time,
+            "genDesc-End": genDesc_end_time,
+            "genDesc-Time": genDesc_total_time,
+            "genOCR": chars,
+            "genOCR-Start": ocr_start_time,
+            "genOCR-End": ocr_end_time,
+            "genOCR-Time": ocr_total_time,
+            "refineDesc": refined_desc,
+            "refineDesc-Time": refine_total_time,
+            "totalTime": total_overall_time,
+        }
+        print(f"FINISHED BOOK {book_id} | IMAGE {image_path}")
+        return record
+
+
+def generateCSV(csv_file_path: str, benchmark_records: list[dict]):
+    """Write benchmark records to a CSV file; columns follow the first record."""
+    if not benchmark_records:  # nothing to derive the header from
+        print("No benchmark records to write; CSV file was not generated.")
+        return
+    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=list(benchmark_records[0]))
+        writer.writeheader()
+        writer.writerows(benchmark_records)
+
+    print(f"CSV file has been generated at: {csv_file_path}")
+
+
+def benchmarkBooks(booksDir: str, srcsDir: str):
+    """Benchmark every listed image of each book and write test_benchmark.csv.
+
+    Args:
+        booksDir: Folder with one sub-folder per book id.
+        srcsDir: Folder with the ebook_<book id>.txt image lists.
+    """
+    generator = AltTextGenerator(
+        ReplicateAPI(keys.ReplicateEricKey()),
+        Tesseract(),
+        OpenAIAPI(keys.OpenAIKey(), "gpt-3.5-turbo"),
+    )
+    records = []
+    for bookId in os.listdir(booksDir):
+        bookPath = os.path.join(booksDir, bookId)
+        srcsPath = f"{srcsDir}/ebook_{bookId}.txt"
+        if not os.path.isfile(srcsPath):  # not a book, e.g. a folder of zip files
+            continue
+        try:
+            htmls = [name for name in os.listdir(bookPath) if name.endswith(".html")]
+            if not htmls:
+                raise FileNotFoundError(f"no .html file found in {bookPath}")
+            generator.parseFile(os.path.join(bookPath, htmls[0]))
+            with open(srcsPath, "r") as file:
+                marker = f"{bookId}/"  # the image src is what follows this marker
+                srcs = [ln.split(marker)[1].strip() for ln in file if marker in ln]
+            for src in srcs:
+                try:
+                    records.append(generator.genAltTextV2(src, bookId, src, bookPath))
+                except Exception as e:
+                    print(f"Error processing image {src} in book {bookId}: {e}")
+        except Exception as e:
+            print(f"Error processing book {bookId}: {e}")
+    generateCSV("test_benchmark.csv", records)
+
+
+if __name__ == "__main__":
+    print("RUNNING AUTOMATE.PY")
+    benchmarkBooks("./downloaded_books", "./book_outputs")  # (booksDir, srcsDir), relative to the working directory
--- a/tests/downloadbooks.py
+++ b/tests/downloadbooks.py
@ -0,0 +1,71 @@
+# The goal of this file is to download the books and unzip them to be used by automate.py!
+
+import os
+import requests
+import zipfile
+import re
+
+folder_path = "book_outputs"  # scanned for .txt files whose names contain the book id
+download_folder = "downloaded_books/download_files"  # where the <book_id>.zip files are saved
+extraction_folder = "downloaded_books"  # each zip is extracted into <extraction_folder>/<book_id>
+
+
+def download_and_unzip_books(folder_path, download_folder, extraction_folder):
+    """Download and extract the Gutenberg HTML zip of every listed book.
+
+    Args:
+        folder_path: Folder of .txt files; the first run of digits in each
+            filename is used as the book id.
+        download_folder: Folder where <book_id>.zip files are stored.
+        extraction_folder: Folder that receives one <book_id> sub-folder per book.
+    """
+    base_url = "https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}-h.zip"
+
+    # Ensure the download and extraction folders exist
+    os.makedirs(download_folder, exist_ok=True)
+    os.makedirs(extraction_folder, exist_ok=True)
+
+    # Iterate through each text file in the folder
+    for filename in os.listdir(folder_path):
+        if not filename.endswith(".txt"):
+            continue
+        # Use regex to extract only the numeric part of the book ID
+        match = re.search(r"\d+", filename)
+        if not match:
+            print(f"No book ID found in {filename}")
+            continue
+        book_id = match.group()
+        zip_file_path = os.path.join(download_folder, f"{book_id}.zip")
+
+        if os.path.isfile(zip_file_path):
+            print(f"{book_id}.zip already exists. Skipping download.")
+        else:
+            try:
+                # The timeout keeps one stalled connection from hanging the run.
+                response = requests.get(base_url.format(book_id=book_id), timeout=60)
+                response.raise_for_status()  # Raise an error for bad responses
+            except requests.RequestException as e:
+                print(f"Error downloading {book_id}.zip: {e}")
+                continue  # no zip on disk, so there is nothing to extract
+            with open(zip_file_path, "wb") as zip_file:
+                zip_file.write(response.content)
+            print(f"Downloaded {book_id}.zip successfully to {download_folder}.")
+
+        book_extraction_folder = os.path.join(extraction_folder, book_id)
+        if os.path.exists(book_extraction_folder):
+            print(
+                f"Extraction folder for {book_id} already exists. Skipping extraction."
+            )
+            continue
+        # Unzip the file into the specified extraction folder
+        try:
+            with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
+                zip_ref.extractall(book_extraction_folder)
+            print(f"Extracted {book_id}.zip to {book_extraction_folder}.")
+        except zipfile.BadZipFile:
+            print(
+                f"Error unzipping {book_id}.zip: The file may be corrupt or not a zip file."
+            )
+
+
+download_and_unzip_books(folder_path, download_folder, extraction_folder)  # unguarded: also runs when this module is imported
--- a/tests/getbooks.py
+++ b/tests/getbooks.py
@ -0,0 +1,34 @@
+# Used to chunk the empty_alt_text.txt into multiple, more digestible .txt files
+# Will potentially eventually be used to upload from the file right into a database of books
+# Then will update the file paths, download & install the books with images
+
+import os
+
+input_file = "./empty_alt_text_sample.txt"  # The file path of whatever initial .txt you are working with
+output_folder = "./book_outputs"  # Folder that receives the per-book ebook_<book number>.txt files
+
+
+def createIndividualBookFiles(input_file, output_folder):
+    """Split input_file into per-book ebook_<number>.txt files in output_folder.
+
+    The book number is the first whitespace-separated token of each line. Lines
+    are appended, so running this twice duplicates the content of each file.
+    """
+    os.makedirs(output_folder, exist_ok=True)
+
+    last_book_number = None
+    with open(input_file, "r") as file:
+        for line in file:
+            fields = line.split()
+            if not fields:
+                continue  # blank line: there is no book number to read
+            # Only rebuild the output path when the book number changes.
+            if fields[0] != last_book_number:
+                last_book_number = fields[0]
+                output_file_name = f"ebook_{last_book_number}.txt"
+                output_path = os.path.join(output_folder, output_file_name)
+            with open(output_path, "a") as output_file:
+                output_file.write(line)
+
+
+createIndividualBookFiles(input_file, output_folder)  # unguarded: also runs when this module is imported