automate.py not working

dev-ethan
kethan351 2024-02-15 17:17:30 -05:00
parent 7277db936d
commit 87505bb018
2 changed files with 56 additions and 34 deletions

View File

@ -1,44 +1,67 @@
#This file will perform the actual generation of alt-text for images and the benchmarking of the system
#Run getbooks.py, then downloadbooks.py, with whichever .txt file is being used; then use their output to move on to the next steps
import os import os
from pathlib import Path import bs4
from alttext import genAltTextV2 from bs4 import BeautifulSoup
from descengine import genDesc import time
from ocrengine import genChars from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars
from langengine import refineDesc, refineOCR #need to implement these from ..src.alttext.langengine import refineAlt
def read_paths_from_file(file_path): class BookParser:
"""Reads image paths from a given file and returns a list of tuples containing book number and path.""" def __init__(self):
with open(file_path, 'r') as file: self.filepath = ""
lines = file.readlines() self.filename = ""
paths = [line.strip().split('\t') for line in lines] self.filedir = ""
return paths
def generate_alt_text_for_images(image_paths): def parse(self, html):
""" # Parse the HTML content with BeautifulSoup
Generates alt-text for a list of image paths. Each path is a tuple containing the book number and the image path. return BeautifulSoup(html, 'html.parser')
"""
alt_texts = []
for path_info in image_paths:
book_num, image_path = path_info.split('\t')
full_image_path = f"cache/epub/{book_num}/images/{image_path}"
# Generate alt-text using the genAltTextV2 method def parseFile(self, filepath: str) -> bs4.BeautifulSoup:
alt_text = alt_text.genAltTextV2(full_image_path) #I don't think I am doing this right with open(filepath, encoding="utf8") as html:
self.filepath = filepath
l = filepath.split("/")
self.filename = l.pop()
self.filedir = "/".join(l) + "/"
return self.parse(html)
alt_texts.append((book_num, image_path, alt_text)) def process_books(extraction_folder):
parser = BookParser()
return alt_texts # Iterate through each book's directory
for book_id in os.listdir(extraction_folder):
book_path = os.path.join(extraction_folder, book_id)
if os.path.isdir(book_path):
# Iterate through files in the book's directory
for filename in os.listdir(book_path):
filepath = os.path.join(book_path, filename)
# Check if the file is an HTML file
if filepath.endswith(".html"):
# Use the parseFile method to parse the HTML file
soup = parser.parseFile(filepath)
# Now `soup` contains the parsed HTML file for further processing
def main(): # Example of further processing: print the title of the HTML document
input_file = '../empty_alt_text_sample.text' # Update this path title = soup.find('title').get_text() if soup.find('title') else 'No title'
output_file = '../generated_alt_texts.txt' # Update this path print(f"Book ID: {book_id}, File: {filename}, Title: {title}")
image_paths = read_paths_from_file(input_file) #Use genAltTextV2
alt_texts = generate_alt_text_for_images(image_paths) #ADD benchmark time stamps
def genAltTextV2(self, src: str) -> str:
imgdata = self.getImgData(src)
context = [None, None]
if self.options["withContext"]:
context = self.getContext(self.getImg(src))
desc = self.genDesc(imgdata, src, context)
with open(output_file, 'w') as file: chars = ""
for alt_text in alt_texts: if self.ocrEngine != None:
file.write(f'{alt_text}\n') chars = self.genChars(imgdata, src).strip()
if __name__ == '__main__': if self.langEngine == None:
main() raise Exception("To use version 2, you must have a langEngine set.")
return self.langEngine.refineAlt(desc, chars, context, None)
#Add .csv generation for benchmark variables

View File

@ -5,7 +5,6 @@
import os import os
input_file = '../empty_alt_text_sample.TXT' #The file path of whatever initial .txt you are working with input_file = '../empty_alt_text_sample.TXT' #The file path of whatever initial .txt you are working with
n = 5 #Constant number of books to be iterated through
output_folder = 'book_outputs' output_folder = 'book_outputs'
def create_individual_book_files(input_file, output_folder): def create_individual_book_files(input_file, output_folder):