get,download,automate
parent
c0d2de4ed6
commit
507f261f7c
|
@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
|
||||||
import time
|
import time
|
||||||
from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars
|
from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars
|
||||||
from ..src.alttext.langengine import refineAlt
|
from ..src.alttext.langengine import refineAlt
|
||||||
|
import csv
|
||||||
|
|
||||||
class BookParser:
|
class BookParser:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -46,22 +47,84 @@ def process_books(extraction_folder):
|
||||||
title = soup.find('title').get_text() if soup.find('title') else 'No title'
|
title = soup.find('title').get_text() if soup.find('title') else 'No title'
|
||||||
print(f"Book ID: {book_id}, File: {filename}, Title: {title}")
|
print(f"Book ID: {book_id}, File: {filename}, Title: {title}")
|
||||||
|
|
||||||
#Use genAltTextV2
|
class AltTextGenerator:
|
||||||
#ADD benchmark time stamps
|
def __init__(self):
|
||||||
def genAltTextV2(self, src: str) -> str:
|
self.benchmark_records = []
|
||||||
|
|
||||||
|
#Use genAltTextV2
|
||||||
|
#ADD benchmark time stamps
|
||||||
|
def genAltTextV2(self, src: str) -> str:
|
||||||
|
# Start total timing
|
||||||
|
total_start_time = time.time()
|
||||||
|
|
||||||
|
# Image data extraction timing
|
||||||
|
imgdata_start_time = time.time()
|
||||||
imgdata = self.getImgData(src)
|
imgdata = self.getImgData(src)
|
||||||
|
imgdata_end_time = time.time()
|
||||||
|
imgdata_total_time = imgdata_end_time - imgdata_start_time
|
||||||
|
|
||||||
|
# Context extraction timing
|
||||||
context = [None, None]
|
context = [None, None]
|
||||||
|
context_start_time = time.time()
|
||||||
if self.options["withContext"]:
|
if self.options["withContext"]:
|
||||||
context = self.getContext(self.getImg(src))
|
context = self.getContext(self.getImg(src))
|
||||||
|
context_end_time = time.time()
|
||||||
|
context_total_time = context_end_time - context_start_time
|
||||||
|
beforeContext = context[0]
|
||||||
|
afterContext = context[1]
|
||||||
|
|
||||||
|
# Description generation timing
|
||||||
|
genDesc_start_time = time.time()
|
||||||
desc = self.genDesc(imgdata, src, context)
|
desc = self.genDesc(imgdata, src, context)
|
||||||
|
genDesc_end_time = time.time()
|
||||||
|
genDesc_total_time = genDesc_end_time - genDesc_start_time
|
||||||
|
|
||||||
|
# OCR processing timing
|
||||||
|
ocr_start_time = time.time()
|
||||||
chars = ""
|
chars = ""
|
||||||
if self.ocrEngine != None:
|
if self.ocrEngine is not None:
|
||||||
chars = self.genChars(imgdata, src).strip()
|
chars = self.genChars(imgdata, src).strip()
|
||||||
|
ocr_end_time = time.time()
|
||||||
|
ocr_total_time = ocr_end_time - ocr_start_time
|
||||||
|
|
||||||
if self.langEngine == None:
|
# Refinement processing timing
|
||||||
|
refine_start_time = time.time()
|
||||||
|
if self.langEngine is None:
|
||||||
raise Exception("To use version 2, you must have a langEngine set.")
|
raise Exception("To use version 2, you must have a langEngine set.")
|
||||||
|
refined_desc = self.langEngine.refineAlt(desc, chars, context, None)
|
||||||
|
refine_end_time = time.time()
|
||||||
|
refine_total_time = refine_end_time - refine_start_time
|
||||||
|
|
||||||
return self.langEngine.refineAlt(desc, chars, context, None)
|
# End total timing
|
||||||
|
total_end_time = time.time()
|
||||||
|
total_overall_time = total_end_time - total_start_time
|
||||||
|
|
||||||
#Add .csv generation for benchmark variables
|
#Record dictionary to store all the timing data
|
||||||
|
record = {
|
||||||
|
"Image Data Extraction Time": imgdata_total_time,
|
||||||
|
"Context Extraction Time": context_total_time,
|
||||||
|
"Description Generation Time": genDesc_total_time,
|
||||||
|
"OCR Processing Time": ocr_total_time,
|
||||||
|
"Refinement Processing Time": refine_total_time,
|
||||||
|
"Total Overall Time": total_overall_time
|
||||||
|
}
|
||||||
|
# Add record to benchmark_records for later CSV generation
|
||||||
|
self.benchmark_records.append(record)
|
||||||
|
|
||||||
|
return refined_desc
|
||||||
|
|
||||||
|
#CSV generation
|
||||||
|
def generate_csv(benchmark_records, csv_file_path):
|
||||||
|
if not benchmark_records:
|
||||||
|
print("No benchmark data available.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Determine the CSV field names from the keys of the first record
|
||||||
|
fieldnames = benchmark_records[0].keys()
|
||||||
|
|
||||||
|
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csvfile:
|
||||||
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||||
|
writer.writeheader()
|
||||||
|
for record in benchmark_records:
|
||||||
|
writer.writerow(record)
|
||||||
|
print(f"CSV file has been generated at: {csv_file_path}")
|
Loading…
Reference in New Issue