automate.py not working
parent
7277db936d
commit
87505bb018
|
@ -1,44 +1,67 @@
|
||||||
|
#This file will be the actual generation of images and benchmarking of the system
|
||||||
|
|
||||||
|
#Run getbooks.py then downloadbooks.py with whatever .txt is being used then use those to move into the next steps
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
import bs4
|
||||||
from alttext import genAltTextV2
|
from bs4 import BeautifulSoup
|
||||||
from descengine import genDesc
|
import time
|
||||||
from ocrengine import genChars
|
from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars
|
||||||
from langengine import refineDesc, refineOCR #need to implement these
|
from ..src.alttext.langengine import refineAlt
|
||||||
|
|
||||||
def read_paths_from_file(file_path):
|
class BookParser:
|
||||||
"""Reads image paths from a given file and returns a list of tuples containing book number and path."""
|
def __init__(self):
|
||||||
with open(file_path, 'r') as file:
|
self.filepath = ""
|
||||||
lines = file.readlines()
|
self.filename = ""
|
||||||
paths = [line.strip().split('\t') for line in lines]
|
self.filedir = ""
|
||||||
return paths
|
|
||||||
|
|
||||||
def generate_alt_text_for_images(image_paths):
|
def parse(self, html):
|
||||||
"""
|
# Parse the HTML content with BeautifulSoup
|
||||||
Generates alt-text for a list of image paths. Each path is a tuple containing the book number and the image path.
|
return BeautifulSoup(html, 'html.parser')
|
||||||
"""
|
|
||||||
alt_texts = []
|
|
||||||
for path_info in image_paths:
|
|
||||||
book_num, image_path = path_info.split('\t')
|
|
||||||
full_image_path = f"cache/epub/{book_num}/images/{image_path}"
|
|
||||||
|
|
||||||
# Generate alt-text using the genAltTextV2 method
|
def parseFile(self, filepath: str) -> bs4.BeautifulSoup:
|
||||||
alt_text = alt_text.genAltTextV2(full_image_path) #I don't think I am doing this right
|
with open(filepath, encoding="utf8") as html:
|
||||||
|
self.filepath = filepath
|
||||||
|
l = filepath.split("/")
|
||||||
|
self.filename = l.pop()
|
||||||
|
self.filedir = "/".join(l) + "/"
|
||||||
|
return self.parse(html)
|
||||||
|
|
||||||
alt_texts.append((book_num, image_path, alt_text))
|
def process_books(extraction_folder):
|
||||||
|
parser = BookParser()
|
||||||
|
|
||||||
return alt_texts
|
# Iterate through each book's directory
|
||||||
|
for book_id in os.listdir(extraction_folder):
|
||||||
|
book_path = os.path.join(extraction_folder, book_id)
|
||||||
|
if os.path.isdir(book_path):
|
||||||
|
# Iterate through files in the book's directory
|
||||||
|
for filename in os.listdir(book_path):
|
||||||
|
filepath = os.path.join(book_path, filename)
|
||||||
|
# Check if the file is an HTML file
|
||||||
|
if filepath.endswith(".html"):
|
||||||
|
# Use the parseFile method to parse the HTML file
|
||||||
|
soup = parser.parseFile(filepath)
|
||||||
|
# Now `soup` contains the parsed HTML file for further processing
|
||||||
|
|
||||||
def main():
|
# Example of further processing: print the title of the HTML document
|
||||||
input_file = '../empty_alt_text_sample.text' # Update this path
|
title = soup.find('title').get_text() if soup.find('title') else 'No title'
|
||||||
output_file = '../generated_alt_texts.txt' # Update this path
|
print(f"Book ID: {book_id}, File: {filename}, Title: {title}")
|
||||||
|
|
||||||
image_paths = read_paths_from_file(input_file)
|
#Use genAltTextV2
|
||||||
alt_texts = generate_alt_text_for_images(image_paths)
|
#ADD benchmark time stamps
|
||||||
|
def genAltTextV2(self, src: str) -> str:
|
||||||
|
imgdata = self.getImgData(src)
|
||||||
|
context = [None, None]
|
||||||
|
if self.options["withContext"]:
|
||||||
|
context = self.getContext(self.getImg(src))
|
||||||
|
desc = self.genDesc(imgdata, src, context)
|
||||||
|
|
||||||
with open(output_file, 'w') as file:
|
chars = ""
|
||||||
for alt_text in alt_texts:
|
if self.ocrEngine != None:
|
||||||
file.write(f'{alt_text}\n')
|
chars = self.genChars(imgdata, src).strip()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if self.langEngine == None:
|
||||||
main()
|
raise Exception("To use version 2, you must have a langEngine set.")
|
||||||
|
|
||||||
|
return self.langEngine.refineAlt(desc, chars, context, None)
|
||||||
|
|
||||||
|
#Add .csv generation for benchmark variables
|
|
@ -5,7 +5,6 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
input_file = '../empty_alt_text_sample.TXT' #The file path of whatever initial .txt you are working with
|
input_file = '../empty_alt_text_sample.TXT' #The file path of whatever initial .txt you are working with
|
||||||
n = 5 #Constant number of books to be iterated through
|
|
||||||
output_folder = 'book_outputs'
|
output_folder = 'book_outputs'
|
||||||
|
|
||||||
def create_individual_book_files(input_file, output_folder):
|
def create_individual_book_files(input_file, output_folder):
|
||||||
|
|
Loading…
Reference in New Issue