get,download,automate

pull/19/head
kethan351 2024-02-16 12:09:45 -05:00
parent c0d2de4ed6
commit 507f261f7c
1 changed files with 70 additions and 7 deletions

View File

@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
import time import time
from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars
from ..src.alttext.langengine import refineAlt from ..src.alttext.langengine import refineAlt
import csv
class BookParser: class BookParser:
def __init__(self): def __init__(self):
@ -46,22 +47,84 @@ def process_books(extraction_folder):
title = soup.find('title').get_text() if soup.find('title') else 'No title' title = soup.find('title').get_text() if soup.find('title') else 'No title'
print(f"Book ID: {book_id}, File: {filename}, Title: {title}") print(f"Book ID: {book_id}, File: {filename}, Title: {title}")
class AltTextGenerator:
def __init__(self):
self.benchmark_records = []
#Use genAltTextV2 #Use genAltTextV2
#ADD benchmark time stamps #ADD benchmark time stamps
def genAltTextV2(self, src: str) -> str: def genAltTextV2(self, src: str) -> str:
# Start total timing
total_start_time = time.time()
# Image data extraction timing
imgdata_start_time = time.time()
imgdata = self.getImgData(src) imgdata = self.getImgData(src)
imgdata_end_time = time.time()
imgdata_total_time = imgdata_end_time - imgdata_start_time
# Context extraction timing
context = [None, None] context = [None, None]
context_start_time = time.time()
if self.options["withContext"]: if self.options["withContext"]:
context = self.getContext(self.getImg(src)) context = self.getContext(self.getImg(src))
context_end_time = time.time()
context_total_time = context_end_time - context_start_time
beforeContext = context[0]
afterContext = context[1]
# Description generation timing
genDesc_start_time = time.time()
desc = self.genDesc(imgdata, src, context) desc = self.genDesc(imgdata, src, context)
genDesc_end_time = time.time()
genDesc_total_time = genDesc_end_time - genDesc_start_time
# OCR processing timing
ocr_start_time = time.time()
chars = "" chars = ""
if self.ocrEngine != None: if self.ocrEngine is not None:
chars = self.genChars(imgdata, src).strip() chars = self.genChars(imgdata, src).strip()
ocr_end_time = time.time()
ocr_total_time = ocr_end_time - ocr_start_time
if self.langEngine == None: # Refinement processing timing
refine_start_time = time.time()
if self.langEngine is None:
raise Exception("To use version 2, you must have a langEngine set.") raise Exception("To use version 2, you must have a langEngine set.")
refined_desc = self.langEngine.refineAlt(desc, chars, context, None)
refine_end_time = time.time()
refine_total_time = refine_end_time - refine_start_time
return self.langEngine.refineAlt(desc, chars, context, None) # End total timing
total_end_time = time.time()
total_overall_time = total_end_time - total_start_time
#Add .csv generation for benchmark variables #Record dictionary to store all the timing data
record = {
"Image Data Extraction Time": imgdata_total_time,
"Context Extraction Time": context_total_time,
"Description Generation Time": genDesc_total_time,
"OCR Processing Time": ocr_total_time,
"Refinement Processing Time": refine_total_time,
"Total Overall Time": total_overall_time
}
# Add record to benchmark_records for later CSV generation
self.benchmark_records.append(record)
return refined_desc
#CSV generation
def generate_csv(benchmark_records, csv_file_path):
if not benchmark_records:
print("No benchmark data available.")
return
# Determine the CSV field names from the keys of the first record
fieldnames = benchmark_records[0].keys()
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for record in benchmark_records:
writer.writerow(record)
print(f"CSV file has been generated at: {csv_file_path}")