automate work

2024-02-16 14:48:55 -05:00 · 2024-02-16 14:48:55 -05:00 · e5c5d03d40
parent 131fd07779
commit e5c5d03d40
1 changed files with 54 additions and 49 deletions
--- a/tests/automate.py
+++ b/tests/automate.py
@ -1,54 +1,28 @@
-#This file will be the actual generation of images and benchmarking of the system
+# automate.py - tests the generation of images and benchmarks the systems
 # run getbooks.py then downloadbooks.py with input (.txt file), use output for next steps
-#Run getbooks.py then downloadbooks.py with whatever .txt is being used then use those to move into the next steps
+# imports
 import os
 import time
 import csv
 import bs4
 from bs4 import BeautifulSoup
-import time
+from ..src.alttext.alttext import AltTextHTML
-from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars
+from ..src.alttext.langengine import PrivateGPT
-from ..src.alttext.langengine import refineAlt
+
-import csv
+# access downloaded books and go thru all of them
 # 1. parse each HTML file to find every img src and get its before/after context (using the getContext function)
 # 2. generate alt text using genAltTextV2 (benchmarking still to be added)
 # 3. save the alt text and benchmarking results in a CSV (see the CSV file headings)
 # iterate through the downloaded_books folder, passing each HTML file into parseFile
 class AltTextGenerator(AltTextHTML):
    # uses the class from alttext.py
    # adds relevant benchmarking and saving methods
 class BookParser:
    """Parse book HTML with BeautifulSoup and remember the last file opened.

    Attributes (all empty strings until ``parseFile`` opens a file):
        filepath: the path exactly as it was passed to ``parseFile``.
        filename: the final component of that path.
        filedir: the containing directory, ending with a path separator,
            or "" when the path has no directory part.
    """

    def __init__(self):
        self.filepath = ""
        self.filename = ""
        self.filedir = ""

    def parse(self, html):
        """Return *html* parsed by BeautifulSoup using the 'html.parser' backend."""
        return BeautifulSoup(html, 'html.parser')

    def parseFile(self, filepath: str) -> bs4.BeautifulSoup:
        """Open *filepath* as UTF-8, record its location and return the parsed tree.

        The location attributes are updated only once the file has been
        opened, so a failed ``open`` leaves the previous values untouched.

        Args:
            filepath: path of the HTML file to read.

        Returns:
            The BeautifulSoup tree produced by ``parse`` for the file.

        Raises:
            OSError: propagated from ``open`` if the file cannot be opened.
        """
        with open(filepath, encoding="utf8") as html:
            self.filepath = filepath
            # os.path.split honours the platform separator; splitting on a
            # literal "/" left backslash-separated paths unsplit.
            dirname, self.filename = os.path.split(filepath)
            # join(dirname, "") appends exactly one trailing separator and
            # keeps "" for a bare filename, where the manual join used to
            # yield "/" (the filesystem root).
            self.filedir = os.path.join(dirname, "")
            return self.parse(html)
 def process_books(extraction_folder):
    """Parse every ``.html`` file one level below *extraction_folder* and print its title.

    Each sub-directory of *extraction_folder* is treated as one book whose
    directory name is the book id. Entries that are not directories, and
    files whose path does not end in ".html", are skipped. For every parsed
    file one line with the book id, file name and title is printed; the
    title falls back to 'No title' when the document has no <title> tag.
    """
    book_parser = BookParser()
    for book_id in os.listdir(extraction_folder):
        book_dir = os.path.join(extraction_folder, book_id)
        # Only directories are books; stray files at the top level are ignored.
        if not os.path.isdir(book_dir):
            continue
        for filename in os.listdir(book_dir):
            html_path = os.path.join(book_dir, filename)
            if not html_path.endswith(".html"):
                continue
            document = book_parser.parseFile(html_path)
            # Look the tag up once and reuse it for both the check and the text.
            title_tag = document.find('title')
            if title_tag:
                title = title_tag.get_text()
            else:
                title = 'No title'
            print(f"Book ID: {book_id}, File: {filename}, Title: {title}")
 class AltTextGenerator:
    def __init__(self):
        super().__init__()
        self.benchmark_records = []
    #Use genAltTextV2
@ -114,7 +88,10 @@ class AltTextGenerator:
        return refined_desc
    #CSV generation
-    def generate_csv(benchmark_records, csv_file_path):
+    def generate_csv(self, csv_file_path, benchmark_records):
        if not benchmark_records:
            benchmark_records = self.benchmark_records
            if not benchmark_records:
                print("No benchmark data available.")
                return
@ -128,3 +105,31 @@ class AltTextGenerator:
            for record in benchmark_records:
                writer.writerow(record)
        print(f"CSV file has been generated at: {csv_file_path}")
 def automate_process(extr_folder : str, csv_file_path : str = 'test_benchmark.csv'):
    """Generate alt text for every HTML file of every book and write the benchmark CSV.

    Each sub-directory of *extr_folder* (e.g. ``downloaded_books``) is
    treated as one book. Every file in it whose path ends in ".html" is
    parsed with the generator's ``parseFile`` and handed to ``genAltText``.
    When all books are done, the generator's ``benchmark_records`` are
    written with ``generate_csv``.

    Args:
        extr_folder: folder containing one sub-directory per book.
        csv_file_path: where the benchmark CSV is written. Defaults to
            'test_benchmark.csv', the path that was previously hard-coded.
    """
    generator = AltTextGenerator()
    for book_id in os.listdir(extr_folder):
        book_path = os.path.join(extr_folder, book_id)
        # Skip anything at the top level that is not a book directory.
        if not os.path.isdir(book_path):
            continue
        for filename in os.listdir(book_path):
            filepath = os.path.join(book_path, filename)
            if not filepath.endswith(".html"):
                continue
            soup = generator.parseFile(filepath)
            # NOTE(review): genAltText is given whatever parseFile returns;
            # its definition is not visible here - confirm it expects that
            # value rather than, e.g., a single image src.
            generator.genAltText(soup)
    generator.generate_csv(csv_file_path, generator.benchmark_records)
 def _run_as_script():
    """Announce the run, then process the default 'downloaded_books' folder."""
    print("Running automate.py")
    automate_process('downloaded_books')

 # Only run the automation when this file is executed directly.
 if __name__ == "__main__":
    _run_as_script()