From b29bbc39f25fe49cf83bbd917b4ebe7fcc714787 Mon Sep 17 00:00:00 2001 From: kethan351 Date: Fri, 9 Feb 2024 14:07:16 -0500 Subject: [PATCH 01/11] ethan branch --- generated_alt_texts.TXT | 0 src/alttext/automate.py | 44 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 generated_alt_texts.TXT create mode 100644 src/alttext/automate.py diff --git a/generated_alt_texts.TXT b/generated_alt_texts.TXT new file mode 100644 index 0000000..e69de29 diff --git a/src/alttext/automate.py b/src/alttext/automate.py new file mode 100644 index 0000000..c81a9bb --- /dev/null +++ b/src/alttext/automate.py @@ -0,0 +1,44 @@ + +import os +from pathlib import Path +from alttext import genAltTextV2 +from descengine import genDesc +from ocrengine import genChars +from langengine import refineDesc, refineOCR #need to implement these + +def read_paths_from_file(file_path): + """Reads image paths from a given file and returns a list of tuples containing book number and path.""" + with open(file_path, 'r') as file: + lines = file.readlines() + paths = [line.strip().split('\t') for line in lines] + return paths + +def generate_alt_text_for_images(image_paths): + """ + Generates alt-text for a list of image paths. Each path is a tuple containing the book number and the image path. + """ + alt_texts = [] + for path_info in image_paths: + book_num, image_path = path_info.split('\t') + full_image_path = f"cache/epub/{book_num}/images/{image_path}" + + # Generate alt-text using the genAltTextV2 method + alt_text = alt_text.genAltTextV2(full_image_path) #I don't think I am doing this right + + alt_texts.append((book_num, image_path, alt_text)) + + return alt_texts + +def main(): + input_file = '../empty_alt_text_sample.text' # Update this path + output_file = '../generated_alt_texts.txt' # Update this path + + image_paths = read_paths_from_file(input_file) + alt_texts = generate_alt_text_for_images(image_paths) + + with open(output_file, 'w') as file: + for alt_text in alt_texts: + file.write(f'{alt_text}\n') + +if __name__ == '__main__': + main() From 3b34021362643b16caae56a31812a4ff53d39a56 Mon Sep 17 00:00:00 2001 From: kethan351 Date: Wed, 14 Feb 2024 20:07:47 -0500 Subject: [PATCH 02/11] chunk and download books --- {src/alttext => tests}/automate.py | 0 tests/downloadbooks.py | 64 ++++++++++++++++++++++++++++++ tests/getbooks.py | 34 ++++++++++++++++ 3 files changed, 98 insertions(+) rename {src/alttext => tests}/automate.py (100%) create mode 100644 tests/downloadbooks.py create mode 100644 tests/getbooks.py diff --git a/src/alttext/automate.py b/tests/automate.py similarity index 100% rename from src/alttext/automate.py rename to tests/automate.py diff --git a/tests/downloadbooks.py b/tests/downloadbooks.py new file mode 100644 index 0000000..7035d51 --- /dev/null +++ b/tests/downloadbooks.py @@ -0,0 +1,64 @@ +#The goal of this file is to download the books and unzip them to be used by automate.py! + +import os +import requests +import zipfile +import re + +folder_path = "book_outputs" +download_folder = "downloaded_books/download_files" +extraction_folder = "downloaded_books" + +def download_and_unzip_books(folder_path, download_folder, extraction_folder): + base_url = "https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}-h.zip" + + # Ensure the download and extraction folders exist + if not os.path.exists(download_folder): + os.makedirs(download_folder) + if not os.path.exists(extraction_folder): + os.makedirs(extraction_folder) + + # Iterate through each text file in the folder + for filename in os.listdir(folder_path): + if filename.endswith(".txt"): + # Use regex to extract only the numeric part of the book ID + match = re.search(r'\d+', filename) + if match: + book_id = match.group() + zip_file_path = os.path.join(download_folder, f"{book_id}.zip") + + # Check if the zip file already exists + if not os.path.isfile(zip_file_path): + url = base_url.format(book_id=book_id) + + # Download the zip file + try: + response = requests.get(url) + response.raise_for_status() # Raise an error for bad responses + + # Save the zip file to the specified download folder + with open(zip_file_path, 'wb') as zip_file: + zip_file.write(response.content) + print(f"Downloaded {book_id}.zip successfully to {download_folder}.") + except requests.RequestException as e: + print(f"Error downloading {book_id}.zip: {e}") + else: + print(f"{book_id}.zip already exists. Skipping download.") + + # Check if the book's extraction folder already exists + book_extraction_folder = os.path.join(extraction_folder, book_id) + if not os.path.exists(book_extraction_folder): + try: + # Unzip the file into the specified extraction folder + with zipfile.ZipFile(zip_file_path, 'r') as zip_ref: + zip_ref.extractall(book_extraction_folder) + print(f"Extracted {book_id}.zip to {book_extraction_folder}.") + except zipfile.BadZipFile: + print(f"Error unzipping {book_id}.zip: The file may be corrupt or not a zip file.") + else: + print(f"Extraction folder for {book_id} already exists. Skipping extraction.") + else: + print(f"No book ID found in {filename}") + +download_and_unzip_books(folder_path, download_folder, extraction_folder) + diff --git a/tests/getbooks.py b/tests/getbooks.py new file mode 100644 index 0000000..2ca278b --- /dev/null +++ b/tests/getbooks.py @@ -0,0 +1,34 @@ +#Used to chunk the empty_alt_text.txt into multiple different more digestable .txt files +#Will potentially eventually be used to upload from the file right into a database of books +#Then will update the file paths, download & install the books with images + +import os + +input_file = '../empty_alt_text_sample.TXT' #The file path of whatever initial .txt you are working with +n = 5 #Constant number of books to be iterated through +output_folder = 'book_outputs' + +def create_individual_book_files(input_file, output_folder): + # Ensure the output folder exists + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + # Keep track of the last book number processed + last_book_number = None + + with open(input_file, 'r') as file: + for line in file: + book_number = line.split()[0] # Extracting book number + # Check if this line is for a new book + if book_number != last_book_number: + output_file_name = f'ebook_{book_number}.txt' + output_path = os.path.join(output_folder, output_file_name) + #print(f"Creating/Updating file for book {book_number}") + last_book_number = book_number + + # Append to the file (creates a new file if it doesn't exist) + with open(output_path, 'a') as output_file: + output_file.write(line) + +create_individual_book_files(input_file, output_folder) + From c0d2de4ed6150a29a0ba02f70cc777c698ba6cdb Mon Sep 17 00:00:00 2001 From: kethan351 Date: Thu, 15 Feb 2024 17:17:30 -0500 Subject: [PATCH 03/11] automate.py not working --- tests/automate.py | 89 +++++++++++++++++++++++++++++------------------ tests/getbooks.py | 1 - 2 files changed, 56 insertions(+), 34 deletions(-) diff --git a/tests/automate.py b/tests/automate.py index c81a9bb..b1cba4d 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -1,44 +1,67 @@ +#This file will be the actual generation of images and benchmarking of the system +#Run getbooks.py then downloadbooks.py with whatever .txt is being used then use those to move into the next steps import os -from pathlib import Path -from alttext import genAltTextV2 -from descengine import genDesc -from ocrengine import genChars -from langengine import refineDesc, refineOCR #need to implement these +import bs4 +from bs4 import BeautifulSoup +import time +from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars +from ..src.alttext.langengine import refineAlt -def read_paths_from_file(file_path): - """Reads image paths from a given file and returns a list of tuples containing book number and path.""" - with open(file_path, 'r') as file: - lines = file.readlines() - paths = [line.strip().split('\t') for line in lines] - return paths +class BookParser: + def __init__(self): + self.filepath = "" + self.filename = "" + self.filedir = "" -def generate_alt_text_for_images(image_paths): - """ - Generates alt-text for a list of image paths. Each path is a tuple containing the book number and the image path. - """ - alt_texts = [] - for path_info in image_paths: - book_num, image_path = path_info.split('\t') - full_image_path = f"cache/epub/{book_num}/images/{image_path}" + def parse(self, html): + # Parse the HTML content with BeautifulSoup + return BeautifulSoup(html, 'html.parser') - # Generate alt-text using the genAltTextV2 method - alt_text = alt_text.genAltTextV2(full_image_path) #I don't think I am doing this right + def parseFile(self, filepath: str) -> bs4.BeautifulSoup: + with open(filepath, encoding="utf8") as html: + self.filepath = filepath + l = filepath.split("/") + self.filename = l.pop() + self.filedir = "/".join(l) + "/" + return self.parse(html) - alt_texts.append((book_num, image_path, alt_text)) +def process_books(extraction_folder): + parser = BookParser() - return alt_texts + # Iterate through each book's directory + for book_id in os.listdir(extraction_folder): + book_path = os.path.join(extraction_folder, book_id) + if os.path.isdir(book_path): + # Iterate through files in the book's directory + for filename in os.listdir(book_path): + filepath = os.path.join(book_path, filename) + # Check if the file is an HTML file + if filepath.endswith(".html"): + # Use the parseFile method to parse the HTML file + soup = parser.parseFile(filepath) + # Now `soup` contains the parsed HTML file for further processing -def main(): - input_file = '../empty_alt_text_sample.text' # Update this path - output_file = '../generated_alt_texts.txt' # Update this path + # Example of further processing: print the title of the HTML document + title = soup.find('title').get_text() if soup.find('title') else 'No title' + print(f"Book ID: {book_id}, File: {filename}, Title: {title}") - image_paths = read_paths_from_file(input_file) - alt_texts = generate_alt_text_for_images(image_paths) +#Use genAltTextV2 +#ADD benchmark time stamps +def genAltTextV2(self, src: str) -> str: + imgdata = self.getImgData(src) + context = [None, None] + if self.options["withContext"]: + context = self.getContext(self.getImg(src)) + desc = self.genDesc(imgdata, src, context) - with open(output_file, 'w') as file: - for alt_text in alt_texts: - file.write(f'{alt_text}\n') + chars = "" + if self.ocrEngine != None: + chars = self.genChars(imgdata, src).strip() -if __name__ == '__main__': - main() + if self.langEngine == None: + raise Exception("To use version 2, you must have a langEngine set.") + + return self.langEngine.refineAlt(desc, chars, context, None) + +#Add .csv generation for benchmark variables \ No newline at end of file diff --git a/tests/getbooks.py b/tests/getbooks.py index 2ca278b..5db1246 100644 --- a/tests/getbooks.py +++ b/tests/getbooks.py @@ -5,7 +5,6 @@ import os input_file = '../empty_alt_text_sample.TXT' #The file path of whatever initial .txt you are working with -n = 5 #Constant number of books to be iterated through output_folder = 'book_outputs' def create_individual_book_files(input_file, output_folder): From 507f261f7c961e0a5828586dbec636fd41c34ce2 Mon Sep 17 00:00:00 2001 From: kethan351 Date: Fri, 16 Feb 2024 12:09:45 -0500 Subject: [PATCH 04/11] get,download,automate --- tests/automate.py | 77 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 7 deletions(-) diff --git a/tests/automate.py b/tests/automate.py index b1cba4d..c97b316 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup import time from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars from ..src.alttext.langengine import refineAlt +import csv class BookParser: def __init__(self): @@ -46,22 +47,84 @@ def process_books(extraction_folder): title = soup.find('title').get_text() if soup.find('title') else 'No title' print(f"Book ID: {book_id}, File: {filename}, Title: {title}") -#Use genAltTextV2 -#ADD benchmark time stamps -def genAltTextV2(self, src: str) -> str: +class AltTextGenerator: + def __init__(self): + self.benchmark_records = [] + + #Use genAltTextV2 + #ADD benchmark time stamps + def genAltTextV2(self, src: str) -> str: + # Start total timing + total_start_time = time.time() + + # Image data extraction timing + imgdata_start_time = time.time() imgdata = self.getImgData(src) + imgdata_end_time = time.time() + imgdata_total_time = imgdata_end_time - imgdata_start_time + + # Context extraction timing context = [None, None] + context_start_time = time.time() if self.options["withContext"]: context = self.getContext(self.getImg(src)) + context_end_time = time.time() + context_total_time = context_end_time - context_start_time + beforeContext = context[0] + afterContext = context[1] + + # Description generation timing + genDesc_start_time = time.time() desc = self.genDesc(imgdata, src, context) + genDesc_end_time = time.time() + genDesc_total_time = genDesc_end_time - genDesc_start_time + # OCR processing timing + ocr_start_time = time.time() chars = "" - if self.ocrEngine != None: + if self.ocrEngine is not None: chars = self.genChars(imgdata, src).strip() + ocr_end_time = time.time() + ocr_total_time = ocr_end_time - ocr_start_time - if self.langEngine == None: + # Refinement processing timing + refine_start_time = time.time() + if self.langEngine is None: raise Exception("To use version 2, you must have a langEngine set.") + refined_desc = self.langEngine.refineAlt(desc, chars, context, None) + refine_end_time = time.time() + refine_total_time = refine_end_time - refine_start_time - return self.langEngine.refineAlt(desc, chars, context, None) + # End total timing + total_end_time = time.time() + total_overall_time = total_end_time - total_start_time -#Add .csv generation for benchmark variables \ No newline at end of file + #Record dictionary to store all the timing data + record = { + "Image Data Extraction Time": imgdata_total_time, + "Context Extraction Time": context_total_time, + "Description Generation Time": genDesc_total_time, + "OCR Processing Time": ocr_total_time, + "Refinement Processing Time": refine_total_time, + "Total Overall Time": total_overall_time + } + # Add record to benchmark_records for later CSV generation + self.benchmark_records.append(record) + + return refined_desc + + #CSV generation + def generate_csv(benchmark_records, csv_file_path): + if not benchmark_records: + print("No benchmark data available.") + return + + # Determine the CSV field names from the keys of the first record + fieldnames = benchmark_records[0].keys() + + with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for record in benchmark_records: + writer.writerow(record) + print(f"CSV file has been generated at: {csv_file_path}") \ No newline at end of file From 14ed54b857c8b1b59fc36906b3ec2da092d4c3ba Mon Sep 17 00:00:00 2001 From: kethan351 Date: Fri, 16 Feb 2024 14:48:55 -0500 Subject: [PATCH 05/11] automate work --- tests/automate.py | 103 ++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/tests/automate.py b/tests/automate.py index c97b316..1133165 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -1,54 +1,28 @@ -#This file will be the actual generation of images and benchmarking of the system +# automate.py - tests the generation of images and benchmarks the systems +# run getbooks.py then downloadbooks.py with input (.txt file), use output for next steps -#Run getbooks.py then downloadbooks.py with whatever .txt is being used then use those to move into the next steps +# imports import os +import time +import csv import bs4 from bs4 import BeautifulSoup -import time -from ..src.alttext.alttext import getImgData, getContext, genDesc, genChars -from ..src.alttext.langengine import refineAlt -import csv +from ..src.alttext.alttext import AltTextHTML +from ..src.alttext.langengine import PrivateGPT + +# access downloaded books and go thru all of them +# 1. parse html file to find img src to get the before and after context (using get context funct) +# 2. generate alt text using genAltTextV2 (add benchmarking at some point) +# 3. save alt text and benchmarking in a csv (see csv file headings) + +# iterate thru downloaded_books folder, pass html into parseFile + +class AltTextGenerator(AltTextHTML): + # uses the class from alttext.py + # adds relevant benchmarking and saving methods -class BookParser: - def __init__(self): - self.filepath = "" - self.filename = "" - self.filedir = "" - - def parse(self, html): - # Parse the HTML content with BeautifulSoup - return BeautifulSoup(html, 'html.parser') - - def parseFile(self, filepath: str) -> bs4.BeautifulSoup: - with open(filepath, encoding="utf8") as html: - self.filepath = filepath - l = filepath.split("/") - self.filename = l.pop() - self.filedir = "/".join(l) + "/" - return self.parse(html) - -def process_books(extraction_folder): - parser = BookParser() - - # Iterate through each book's directory - for book_id in os.listdir(extraction_folder): - book_path = os.path.join(extraction_folder, book_id) - if os.path.isdir(book_path): - # Iterate through files in the book's directory - for filename in os.listdir(book_path): - filepath = os.path.join(book_path, filename) - # Check if the file is an HTML file - if filepath.endswith(".html"): - # Use the parseFile method to parse the HTML file - soup = parser.parseFile(filepath) - # Now `soup` contains the parsed HTML file for further processing - - # Example of further processing: print the title of the HTML document - title = soup.find('title').get_text() if soup.find('title') else 'No title' - print(f"Book ID: {book_id}, File: {filename}, Title: {title}") - -class AltTextGenerator: def __init__(self): + super().__init__() self.benchmark_records = [] #Use genAltTextV2 @@ -114,10 +88,13 @@ class AltTextGenerator: return refined_desc #CSV generation - def generate_csv(benchmark_records, csv_file_path): + def generate_csv(self, csv_file_path, benchmark_records): if not benchmark_records: - print("No benchmark data available.") - return + benchmark_records = self.benchmark_records + + if not benchmark_records: + print("No benchmark data available.") + return # Determine the CSV field names from the keys of the first record fieldnames = benchmark_records[0].keys() @@ -127,4 +104,32 @@ class AltTextGenerator: writer.writeheader() for record in benchmark_records: writer.writerow(record) - print(f"CSV file has been generated at: {csv_file_path}") \ No newline at end of file + print(f"CSV file has been generated at: {csv_file_path}") + +def automate_process(extr_folder : str): + # Iterate through all images in a folder to produce a table (csv) with benchmarking + + generator = AltTextGenerator() + + # Iterate thru each book in folder (ex. downloaded_books) + for book_id in os.listdir(extr_folder): + book_path = os.path.join(extr_folder, book_id) + if os.path.isdir(book_path): + + # Iterate thru files in the book's directory + for filename in os.listdir(book_path): + filepath = os.path.join(book_path, filename) + + # Check if the file is an HTML file + if filepath.endswith(".html"): + + # Use the parseFile method to parse the HTML file for the genAltText function + soup = generator.parseFile(filepath) + generator.genAltText(soup) + + generator.generate_csv('test_benchmark.csv', generator.benchmark_records) + +if __name__ == "__main__": + print("Running automate.py") + + automate_process('downloaded_books') \ No newline at end of file From 075914cafdddb852d46a25365165c8d6c7cc26eb Mon Sep 17 00:00:00 2001 From: kethan351 Date: Fri, 1 Mar 2024 12:22:06 -0500 Subject: [PATCH 06/11] automate debugging --- generated_alt_texts.TXT | 0 src/alttext/alttext.py | 6 +++--- tests/automate.py | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) delete mode 100644 generated_alt_texts.TXT diff --git a/generated_alt_texts.TXT b/generated_alt_texts.TXT deleted file mode 100644 index e69de29..0000000 diff --git a/src/alttext/alttext.py b/src/alttext/alttext.py index 7bec3d7..9ef8350 100644 --- a/src/alttext/alttext.py +++ b/src/alttext/alttext.py @@ -83,7 +83,7 @@ class AltText(ABC): # PARSING METHODS @abstractmethod - def parse(self, data: str) -> bs4.BeautifulSoup | epub.EpubBook: + def parse(self, data: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: """Parses data into a BeautifulSoup or EpubBook object. Args: @@ -95,7 +95,7 @@ class AltText(ABC): pass @abstractmethod - def parseFile(self, filepath: str) -> bs4.BeautifulSoup | epub.EpubBook: + def parseFile(self, filepath: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: """Parses data from a file into a BeautifulSoup or EpubBook object. Args: @@ -162,7 +162,7 @@ class AltText(ABC): pass @abstractmethod - def export(self) -> str | epub.EpubBook: + def export(self) -> typing.Union[str, epub.EpubBook]: """Exports the current data. Returns: diff --git a/tests/automate.py b/tests/automate.py index 1133165..3be6f2f 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -3,12 +3,15 @@ # imports import os +import sys import time import csv import bs4 from bs4 import BeautifulSoup -from ..src.alttext.alttext import AltTextHTML -from ..src.alttext.langengine import PrivateGPT +import importlib +sys.path.append("c:/Users/ketha/Code/Senior D") #This will need to be changed system to system +AltTextHTML = importlib.import_module("alt-text.src.alttext.alttext").AltTextHTML +PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine").PrivateGPT # access downloaded books and go thru all of them # 1. parse html file to find img src to get the before and after context (using get context funct) From bedb5245460d76415c5b4b572595433cc31610e3 Mon Sep 17 00:00:00 2001 From: kethan351 Date: Thu, 7 Mar 2024 16:32:11 -0500 Subject: [PATCH 07/11] reworking with changes in alttext --- src/alttext/alttext.py | 25 +- src/alttext/descengine/bliplocal.py | 29 + src/alttext/{ => descengine}/descengine.py | 0 src/alttext/descengine/googlevertexapi.py | 37 ++ src/alttext/descengine/replicateapi.py | 51 ++ src/alttext/{ => langengine}/langengine.py | 0 src/alttext/langengine/privategpt.py | 119 ++++ src/alttext/{ => ocrengine}/ocrengine.py | 0 src/alttext/ocrengine/tesseract.py | 20 + tests/alttext_old_semi_working.py | 687 +++++++++++++++++++++ tests/automate.py | 45 +- 11 files changed, 983 insertions(+), 30 deletions(-) create mode 100644 src/alttext/descengine/bliplocal.py rename src/alttext/{ => descengine}/descengine.py (100%) create mode 100644 src/alttext/descengine/googlevertexapi.py create mode 100644 src/alttext/descengine/replicateapi.py rename src/alttext/{ => langengine}/langengine.py (100%) create mode 100644 src/alttext/langengine/privategpt.py rename src/alttext/{ => ocrengine}/ocrengine.py (100%) create mode 100644 src/alttext/ocrengine/tesseract.py create mode 100644 tests/alttext_old_semi_working.py diff --git a/src/alttext/alttext.py b/src/alttext/alttext.py index 9ef8350..65e9271 100644 --- a/src/alttext/alttext.py +++ b/src/alttext/alttext.py @@ -6,9 +6,10 @@ import bs4 import ebooklib from ebooklib import epub -from .descengine import DescEngine -from .ocrengine import OCREngine -from .langengine import LangEngine + +from descengine import DescEngine +from ocrengine import OCREngine +from langengine import LangEngine DEFOPTIONS = { @@ -83,7 +84,7 @@ class AltText(ABC): # PARSING METHODS @abstractmethod - def parse(self, data: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: + def parse(self, data: str) -> bs4.BeautifulSoup | epub.EpubBook: """Parses data into a BeautifulSoup or EpubBook object. Args: @@ -95,7 +96,7 @@ class AltText(ABC): pass @abstractmethod - def parseFile(self, filepath: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: + def parseFile(self, filepath: str) -> bs4.BeautifulSoup | epub.EpubBook: """Parses data from a file into a BeautifulSoup or EpubBook object. Args: @@ -162,7 +163,7 @@ class AltText(ABC): pass @abstractmethod - def export(self) -> typing.Union[str, epub.EpubBook]: + def export(self) -> str | epub.EpubBook: """Exports the current data. Returns: @@ -428,8 +429,8 @@ class AltTextHTML(AltText): def parseFile(self, filepath: str) -> bs4.BeautifulSoup: with open(filepath, encoding="utf8") as html: - self.filepath = filepath - l = filepath.split("/") + self.filepath = filepath.replace("\\", "/") + l = self.filepath.split("/") self.filename = l.pop() self.filedir = "/".join(l) + "/" return self.parse(html) @@ -516,20 +517,17 @@ class AltTextHTML(AltText): text = elem.text.strip() context[0] = text except: - print("error 0") context[0] = None elem = tag text = "" try: text = elem.text.strip() while text == "": - elem = elem.previous_element + elem = elem.next_element text = elem.text.strip() context[1] = text except: - print("error 1") context[1] = None - print(context) return context def genChars(self, imgData: bytes, src: str) -> str: @@ -564,7 +562,6 @@ class AltTextHTML(AltText): if self.options["withContext"]: context = self.getContext(self.getImg(src)) desc = self.genDesc(imgdata, src, context) - chars = "" if self.ocrEngine != None: chars = self.genChars(imgdata, src).strip() @@ -684,4 +681,4 @@ class AltTextEPUB(AltText): def exportToFile(self, path: str) -> str: epub.write_epub(path, self.export()) - return path + return path \ No newline at end of file diff --git a/src/alttext/descengine/bliplocal.py b/src/alttext/descengine/bliplocal.py new file mode 100644 index 0000000..5f6417d --- /dev/null +++ b/src/alttext/descengine/bliplocal.py @@ -0,0 +1,29 @@ +import os +import shutil +import subprocess +import uuid + +from .descengine import DescEngine + +class BlipLocal(DescEngine): + def __init__(self, path: str) -> None: + self.__setPath(path) + return None + + def __setPath(self, path: str) -> str: + self.path = path + return self.path + + def genDesc(self, imgData: bytes, src: str, context: str = None) -> str: + folderName = uuid.uuid4() + ext = src.split(".")[-1] + os.makedirs(f"{self.path}/{folderName}") + open(f"{self.path}/{folderName}/image.{ext}", "wb+").write(imgData) + subprocess.call( + f"py inference.py -i ./{folderName} --batch 1 --gpu 0", + cwd=f"{self.path}", + ) + desc = open(f"{self.path}/{folderName}/0_captions.txt", "r").read() + shutil.rmtree(f"{self.path}/{folderName}") + desc = desc.split(",") + return desc[1] \ No newline at end of file diff --git a/src/alttext/descengine.py b/src/alttext/descengine/descengine.py similarity index 100% rename from src/alttext/descengine.py rename to src/alttext/descengine/descengine.py diff --git a/src/alttext/descengine/googlevertexapi.py b/src/alttext/descengine/googlevertexapi.py new file mode 100644 index 0000000..881583d --- /dev/null +++ b/src/alttext/descengine/googlevertexapi.py @@ -0,0 +1,37 @@ +import os +import vertexai +from vertexai.vision_models import ImageTextModel, Image + +from .descengine import DescEngine + +class GoogleVertexAPI(DescEngine): + def __init__(self, project_id: str, location: str, gac_path: str) -> None: + self.project_id = project_id + self.location = location + vertexai.init(project=self.project_id, location=self.location) + + self.gac_path = gac_path + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.gac_path + return None + + def __setProject(self, project_id: str): + self.project_id = project_id + vertexai.init(project=self.project_id, location=self.location) + + def __setLocation(self, location: str): + self.location = location + vertexai.init(project=self.project_id, location=self.location) + + def __setGAC(self, gac_path: str): + self.gac_path = gac_path + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.gac_path + + def genDesc(self, imgData: bytes, src: str, context: str = None) -> str: + model = ImageTextModel.from_pretrained("imagetext@001") + source_image = Image(imgData) + captions = model.get_captions( + image=source_image, + number_of_results=1, + language="en", + ) + return captions[0] \ No newline at end of file diff --git a/src/alttext/descengine/replicateapi.py b/src/alttext/descengine/replicateapi.py new file mode 100644 index 0000000..6483360 --- /dev/null +++ b/src/alttext/descengine/replicateapi.py @@ -0,0 +1,51 @@ +import replicate +import base64 +import os + +from .descengine import DescEngine + +REPLICATE_MODELS = { + "blip": "salesforce/blip:2e1dddc8621f72155f24cf2e0adbde548458d3cab9f00c0139eea840d0ac4746", + "clip_prefix_caption": "rmokady/clip_prefix_caption:9a34a6339872a03f45236f114321fb51fc7aa8269d38ae0ce5334969981e4cd8", + "clip-caption-reward": "j-min/clip-caption-reward:de37751f75135f7ebbe62548e27d6740d5155dfefdf6447db35c9865253d7e06", + "img2prompt": "methexis-inc/img2prompt:50adaf2d3ad20a6f911a8a9e3ccf777b263b8596fbd2c8fc26e8888f8a0edbb5", + "minigpt4": "daanelson/minigpt-4:b96a2f33cc8e4b0aa23eacfce731b9c41a7d9466d9ed4e167375587b54db9423", + "image-captioning-with-visual-attention": "nohamoamary/image-captioning-with-visual-attention:9bb60a6baa58801aa7cd4c4fafc95fcf1531bf59b84962aff5a718f4d1f58986", +} + +class ReplicateAPI(DescEngine): + def __init__(self, key: str, model: str = "blip") -> None: + self.__setKey(key) + self.__setModel(model) + return None + + def __getModel(self) -> str: + return self.model + + def __setModel(self, modelName: str) -> str: + if modelName not in REPLICATE_MODELS: + raise Exception( + f"{modelName} is not a valid model. Please choose from {list(REPLICATE_MODELS.keys())}" + ) + self.model = REPLICATE_MODELS[modelName] + return self.model + + def __getKey(self) -> str: + return self.key + + def __setKey(self, key: str) -> str: + self.key = key + os.environ["REPLICATE_API_TOKEN"] = key + return self.key + + def genDesc(self, imgData: bytes, src: str, context: str = None) -> str: + base64_utf8_str = base64.b64encode(imgData).decode("utf-8") + model = self.__getModel() + ext = src.split(".")[-1] + prompt = "Create alternative-text for this image." + if context != None: + prompt = f"Create alternative-text for this image given the following context...\n{context}" + + dataurl = f"data:image/{ext};base64,{base64_utf8_str}" + output = replicate.run(model, input={"image": dataurl, "prompt": prompt}) + return output \ No newline at end of file diff --git a/src/alttext/langengine.py b/src/alttext/langengine/langengine.py similarity index 100% rename from src/alttext/langengine.py rename to src/alttext/langengine/langengine.py diff --git a/src/alttext/langengine/privategpt.py b/src/alttext/langengine/privategpt.py new file mode 100644 index 0000000..4a79d3b --- /dev/null +++ b/src/alttext/langengine/privategpt.py @@ -0,0 +1,119 @@ +import requests + +from .langengine import LangEngine + +class PrivateGPT(LangEngine): + def __init__(self, host) -> None: + self.host = host + + def __setHost(self, host) -> bool: + self.host = host + return True + + def _completion(self, prompt: str) -> str: + body = { + "include_sources": False, + "prompt": prompt, + "stream": False, + "use_context": False, + } + r = requests.post(f"{self.host}/v1/completions", json=body) + r = r.json() + return r["choices"][0]["message"]["content"].strip() + + def refineDesc(self, description: str) -> str: + prompt = f"""The following string surrounded with '///' was generated by an Image Captioning AI when ran on some arbitrary image. +///{description}/// + +Your goal is to refine the string to be inserted as alt-text for an image in an Ebook. + +Here are guidelines to follow... +1. Prioritize information in text alternative: +Aim to put the most important information at the beginning. +2. Length of the text alternative: +The alt text should be the most concise description possible of the image's purpose. If anything more than a short phrase or sentence is needed, it would be better to use one of the long description methods discussed in complex images. +3. Superfluous information in the text alternative: +Usually, there's no need to include words like “image”, “icon”, or “picture” in the alt text. People who can see will know this already, and screen readers announce the presence of an image. In some situations, it may be important to distinguish between paintings, photographs, or illustrations, etc., but it's best to avoid the more generic use of the terms. + +Format your response as... +The refined string is: + +If the string is empty, simply respond with... +The refined string is: N/A""" + return self._completion(prompt) + + def refineOCR(self, chars: str) -> str: + prompt = f"""The following string surrounded with '///' was generated by an Optical Character Recognition software when ran on some arbitrary image. +/// +{chars} +/// + +Your goal is to refine the string. +There may be random/excess spaces or other characters in the string, please remove them. +Do not surround the refined string in quotation marks. + +Format your response as... +The refined string is: + +If the string is empty, simply respond with... +The refined string is: N/A""" + return self._completion(prompt) + + def genPrompt(self, desc: str, chars: str, context: list[str], caption: str) -> str: + ocr = "" + if chars != None and chars != "": + ocr = f"\nThe following string surrounded with '///' was generated by an Optical Character Recognition software when ran on the image.\n///{chars}///" + before = "" + if context[0] != None and context[0] != "": + before = f"\nThe following string surrounded with '///' is the nearest text found before the image.\n///{context[0]}///" + after = "" + if context[1] != None and context[1] != "": + after = f"\nThe following string surrounded with '///' is the nearest text found after the image.\n///{context[1]}///" + cap = "" + if caption != None and caption != "": + cap = f"\nThe following string surrounded with '///' is a caption in the Ebook for the image.\n///{caption}///" + + prompt = f"""There following information is regarding an image found in an Ebook with no alternative-text. +The following string surrounded with '///' was generated by an Image Captioning AI when ran on the image. +///{desc}///{ocr}{cap}{before}{after} + +Your goal is to create alternative-text for the image given the prior information. + +Here are guidelines to follow to create quality alt-text... +1. Prioritize information in text alternative: +Aim to put the most important information at the beginning. +2. Length of the text alternative: +The alt text should be the most concise description possible of the image's purpose. If anything more than a short phrase or sentence is needed, it would be better to use one of the long description methods discussed in complex images. +3. Superfluous information in the text alternative: +Usually, there's no need to include words like “image”, “icon”, or “picture” in the alt text. People who can see will know this already, and screen readers announce the presence of an image. In some situations, it may be important to distinguish between paintings, photographs, or illustrations, etc., but it's best to avoid the more generic use of the terms. + +Using all of the information stated, please generate alt-text for the image. +In your response, please only give the alt-text.""" + return prompt + + def refineAlt( + self, + desc: str, + chars: str = None, + context: list[str] = None, + caption: str = None, + ) -> str: + prompt = self.genPrompt( + desc, + chars, + context, + caption, + ) + return self._completion(prompt) + + def ingest(self, filename: str, binary) -> bool: + ext = filename.split(".")[1] + files = {"file": (filename, binary, f"application/{ext}")} + headers = {"accept": "application/json"} + r = requests.post(f"{self.host}/v1/ingest", files=files, headers=headers) + return True + + def degest(self, filename: str) -> bool: + headers = {"accept": "application/json"} + r = requests.delete(f"{self.host}/v1/ingest/{filename}", headers=headers) + return True \ No newline at end of file diff --git a/src/alttext/ocrengine.py b/src/alttext/ocrengine/ocrengine.py similarity index 100% rename from src/alttext/ocrengine.py rename to src/alttext/ocrengine/ocrengine.py diff --git a/src/alttext/ocrengine/tesseract.py b/src/alttext/ocrengine/tesseract.py new file mode 100644 index 0000000..6901b20 --- /dev/null +++ b/src/alttext/ocrengine/tesseract.py @@ -0,0 +1,20 @@ +from PIL import Image +from io import BytesIO +import pytesseract + +from .ocrengine import OCREngine + +class Tesseract(OCREngine): + def __init__(self, path:str = None) -> None: + if (path != None): + self._setTesseract(path) + return None + + def _setTesseract(self, path: str) -> bool: + self.customPath = path + pytesseract.pytesseract.tesseract_cmd = path + return True + + def genChars(self, imgData: bytes, src: str, context: str = None) -> str: + image = Image.open(BytesIO(imgData)) + return pytesseract.image_to_string(image) \ No newline at end of file diff --git a/tests/alttext_old_semi_working.py b/tests/alttext_old_semi_working.py new file mode 100644 index 0000000..9ef8350 --- /dev/null +++ b/tests/alttext_old_semi_working.py @@ -0,0 +1,687 @@ +from abc import ABC, abstractmethod +import typing +from threading import Thread + +import bs4 +import ebooklib +from ebooklib import epub + +from .descengine import DescEngine +from .ocrengine import OCREngine +from .langengine import LangEngine + + +DEFOPTIONS = { + "withContext": True, + "withHash": True, + "multiThreaded": True, + "version": 2, +} + + +### ALTTEXT CLASSES +class AltText(ABC): + @abstractmethod + def setDescEngine(self, descEngine: DescEngine) -> bool: + """Sets current description engine. + + Args: + descEngine (DescEngine): A description engine. + + Returns: + bool: True if successful. + """ + pass + + @abstractmethod + def setOCREngine(self, ocrEngine: OCREngine) -> bool: + """Sets current OCR engine. + + Args: + ocrEngine (OCREngine): An OCR engine. + + Returns: + bool: True if successful. + """ + pass + + @abstractmethod + def setLangEngine(self, langEngine: LangEngine) -> bool: + """Sets current language engine. + + Args: + langEngine (LangEngine): A language engine. + + Returns: + bool: True if successful. + """ + pass + + @abstractmethod + def setOptions(self, options: dict) -> bool: + """Sets current options. + + Args: + options (dict): A subset of DEFOPTIONS. See DEFOPTIONS constant for possible fields. + + Returns: + bool: True if successful. + """ + pass + + @abstractmethod + def checkData(self) -> bool: + """Checks if current data exists. + + Returns: + bool: True if data exists. + + Raises: + Exception: If no data exists. + """ + pass + + # PARSING METHODS + @abstractmethod + def parse(self, data: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: + """Parses data into a BeautifulSoup or EpubBook object. + + Args: + data (str): HTML or EPUB data. + + Returns: + bs4.BeautifulSoup | epub.EpubBook: The BeautifulSoup or EpubBook object stored in self.data. + """ + pass + + @abstractmethod + def parseFile(self, filepath: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: + """Parses data from a file into a BeautifulSoup or EpubBook object. + + Args: + filepath (str): Path to HTML or EPUB file. + + Returns: + bs4.BeautifulSoup | epub.EpubBook: The BeautifulSoup or EpubBook object stored in self.data. + """ + pass + + @abstractmethod + def getAllImgs(self) -> typing.List[bs4.element.Tag]: + """Gets all img tags. + + Returns: + typing.List[bs4.element.Tag]: A list of img tags. + """ + pass + + @abstractmethod + def getNoAltImgs(self) -> typing.List[bs4.element.Tag]: + """Gets all img tags that either do not have an alt attribute or alt.strip() is an empty string. + + Returns: + typing.List[bs4.element.Tag]: A list of img tags. + """ + pass + + @abstractmethod + def getImg(self, src: str) -> bs4.element.Tag: + """Gets an img tag given a src. + + Args: + src (str): Image source. + + Returns: + bs4.element.Tag: An img tag. + """ + pass + + @abstractmethod + def setAlt(self, src: str, text: str) -> bs4.element.Tag: + """Sets the alt of an img tag given a src. + + Args: + src (str): Image source. + text (str): New alt-text. + + Returns: + bs4.element.Tag: Newly modified img tag. + """ + pass + + @abstractmethod + def setAlts(self, associations: list[dict]) -> list[bs4.element.Tag]: + """Sets the alt of multiple img tags given a list of associations. + + Args: + associations (list[dict]): A list of associations. Must have keys "src" and "alt". + + Returns: + list[bs4.element.Tag]: A list of newly modified img tags. + """ + pass + + @abstractmethod + def export(self) -> typing.Union[str, epub.EpubBook]: + """Exports the current data. + + Returns: + str | epub.EpubBook: A string of HTML or an epub.EpubBook object. + """ + pass + + @abstractmethod + def exportToFile(self, path: str) -> str: + """Exports the current data to a file. + + Args: + path (str): A path to the file to be written. + + Returns: + str: The path to the file written. + """ + pass + + # GENERATIVE METHODS + @abstractmethod + def ingest(self) -> bool: + """Uploads the current data and to the language engine for ingestion. + This allows the language engine to reference the current data as a document. + + Returns: + bool: True if successful. + + Raises: + Exception: If no langEngine is set. + """ + pass + + @abstractmethod + def degest(self) -> bool: + """Deletes the current data from the language engine. + + Returns: + bool: True if successful. + + Raises: + Exception: If no langEngine is set. + """ + pass + + @abstractmethod + def getImgData(self, src: str) -> bytes: + """Gets byte data of an image given a src. + + Args: + src (str): Image source. + + Returns: + bytes: Image data as bytes. + """ + pass + + @abstractmethod + def getContext(self, tag: bs4.Tag) -> list[str]: + """Gets the context of an img tag. + Context being the text immediately before and after the img tag. + + Args: + tag (bs4.Tag): The img tag to get context for. + + Returns: + list[str]: A list of length 2. The first element is the text immediately before the img tag. The second element is the text immediately after the img tag. + """ + pass + + @abstractmethod + def genChars(self, imgData: bytes, src: str) -> str: + """Searches for characters in an image. + + Args: + imgData (bytes): Image data as bytes. + src (str): Source of the image. + + Returns: + str: String of characters found in the image. + """ + pass + + @abstractmethod + def genDesc(self, imgData: bytes, src: str, context: str = None) -> str: + """Generates a description of an image. + + Args: + imgData (bytes): Image data as bytes. + src (str): Source of the image. + context (str, optional): Context for an image. See getContext for more information. Defaults to None. + + Returns: + str: Description of the image. + """ + pass + + @abstractmethod + def genAltTextV1(self, src: str) -> str: + """Generates alt-text for an image given its source. + Uses V1 Dataflow model. This means the description and characters are generated and optionally refined separately. + + Args: + src (str): Source of the image. + + Returns: + str: Generated alt-text for the image. + """ + pass + + @abstractmethod + def genAltTextV2(self, src: str) -> str: + """Generates alt-text for an image given its source. + Uses V2 Dataflow model. This means the description and characters are generated and then alt-text is generated using both pieces of information. + + Args: + src (str): Source of the image. + + Returns: + str: Generated alt-text for the image. + """ + pass + + @abstractmethod + def genAltText(self, src: str) -> str: + """Generates alt-text for an image given its source and current options. + + Args: + src (str): Source of the image. + + Returns: + str: Generated alt-text for the image. + """ + pass + + @abstractmethod + def genAssociation( + self, + tag: bs4.element.Tag, + ) -> dict: + """Generates alt-text and returns an association given an img tag and current options. + + Args: + tag (bs4.element.Tag): Image tag to make an association for. + + Returns: + dict: The association. Must have keys "src" and "alt". If "withHash" is True, must also have key "hash". + """ + pass + + @abstractmethod + def _genAltAssociationsST( + self, + tags: list[bs4.element.Tag], + ) -> list[dict]: + """Generates alt-text and creates associations given a list of img tags and current options. + Single threaded implementation. + + Args: + tags (list[bs4.element.Tag]): List of img tags to make associations for. + + Returns: + list[dict]: List of associations. Must have keys "src" and "alt". If "withHash" is True, must also have key "hash". + """ + pass + + @abstractmethod + def _genAltAssociationsMT( + self, + tags: list[bs4.element.Tag], + ) -> list[dict]: + """Generates alt-text and creates associations given a list of img tags and current options. + Multi threaded implementation. + + Args: + tags (list[bs4.element.Tag]): List of img tags to make associations for. + + Returns: + list[dict]: List of associations. Must have keys "src" and "alt". If "withHash" is True, must also have key "hash". + """ + pass + + @abstractmethod + def genAltAssociations( + self, + tags: list[bs4.element.Tag], + ) -> list[dict]: + """Generates alt-text and creates associations given a list of img tags and current options. + Automatically selects mutli or single threaded implementation based on current options. + + Args: + tags (list[bs4.element.Tag]): List of img tags to make associations for. + + Returns: + list[dict]: List of associations. Must have keys "src" and "alt". If "withHash" is True, must also have key "hash". + """ + pass + + +### HELPER METHODS +def getSoup(content: str) -> bs4.BeautifulSoup: + try: + return bs4.BeautifulSoup(content, "html.parser") + except Exception as htmlErr: + try: + return bs4.BeautifulSoup(content, features="xml") + except Exception as xmlErr: + raise Exception( + f"Failed to parse the document as HTML: {htmlErr}\nFailed to parse the document as XML: {xmlErr}" + ) + + +### IMPLEMENTATIONS +class AltTextHTML(AltText): + def __init__( + self, + descEngine: DescEngine, + ocrEngine: OCREngine = None, + langEngine: LangEngine = None, + options: dict = {}, + ) -> None: + self.data = None + self.filename = None + self.filedir = None + + self.descEngine = descEngine + self.ocrEngine = ocrEngine + self.langEngine = langEngine + + self.options = DEFOPTIONS + for key in dict.keys(options): + self.options[key] = options[key] + + return None + + def setDescEngine(self, descEngine: DescEngine) -> bool: + self.descEngine = descEngine + return True + + def setOCREngine(self, ocrEngine: OCREngine) -> bool: + self.descEngine = ocrEngine + return True + + def setLangEngine(self, langEngine: LangEngine) -> bool: + self.descEngine = langEngine + return True + + def setOptions(self, options: dict) -> bool: + for key in dict.keys(options): + self.options[key] = options[key] + return True + + def checkData(self) -> bool: + if not hasattr(self, "data") or self.data == None: + raise Exception("no data set. please use .parse or .parseFile") + return True + + # PARSING METHODS + def parse(self, html: str) -> bs4.BeautifulSoup: + soup = getSoup(html) + self.data = soup + return soup + + def parseFile(self, filepath: str) -> bs4.BeautifulSoup: + with open(filepath, encoding="utf8") as html: + self.filepath = filepath + l = filepath.split("/") + self.filename = l.pop() + self.filedir = "/".join(l) + "/" + return self.parse(html) + + def getAllImgs(self) -> typing.List[bs4.element.Tag]: + self.checkData() + imgs = self.data.find_all("img") + return imgs + + def getNoAltImgs(self) -> typing.List[bs4.element.Tag]: + imgs = self.getAllImgs() + noalt = [] + for img in imgs: + if not "alt" in img.attrs.keys() or img.attrs["alt"].strip() == "": + noalt.append(img) + return noalt + + def getImg(self, src: str) -> bs4.element.Tag: + self.checkData() + img = self.data.find("img", src=src) + return img + + def setAlt(self, src: str, text: str) -> bs4.element.Tag: + self.checkData() + img = self.data.find("img", src=src) + img.attrs["alt"] = text + return img + + def setAlts(self, associations: list[dict]) -> list[bs4.element.Tag]: + self.checkData() + tags = [] + for association in associations: + tags.append(self.setAlt(association["src"], association["alt"])) + return tags + + def export(self) -> str: + self.checkData() + html = self.data.prettify() + return html + + def exportToFile(self, path: str) -> str: + html = self.export() + with open(path, "w", encoding="utf-8") as file: + file.write(html) + return path + + # GENERATIVE METHODS + def ingest(self) -> bool: + if self.langEngine == None: + raise Exception( + "To use ingest, you must have an appropriate langEngine set." + ) + with open(self.filepath, "rb") as html: + self.langEngine.ingest(self.filename, html) + return True + + def degest(self) -> bool: + if self.langEngine == None: + raise Exception( + "To use degest, you must have an appropriate langEngine set." + ) + self.langEngine.degest(self.filename) + return True + + def __getImgFilePath(self, src: str) -> str: + self.checkData() + path = f"{self.filedir}{src}" + return path + + def getImgData(self, src: str) -> bytes: + path = self.__getImgFilePath(src) + with open(path, "rb") as bin: + bin = bin.read() + return bin + + def getContext(self, tag: bs4.Tag) -> list[str]: + context = [None, None] + elem = tag + text = "" + try: + text = elem.text.strip() + while text == "": + elem = elem.previous_element + text = elem.text.strip() + context[0] = text + except: + print("error 0") + context[0] = None + elem = tag + text = "" + try: + text = elem.text.strip() + while text == "": + elem = elem.previous_element + text = elem.text.strip() + context[1] = text + except: + print("error 1") + context[1] = None + print(context) + return context + + def genChars(self, imgData: bytes, src: str) -> str: + text = self.ocrEngine.genChars(imgData, src) + return text + + def genDesc(self, imgData: bytes, src: str, context: str = None) -> str: + alt = self.descEngine.genDesc(imgData, src, context) + return alt + + def genAltTextV1(self, src: str) -> str: + imgdata = self.getImgData(src) + context = None + if self.options["withContext"]: + context = self.getContext(self.getImg(src)) + desc = self.genDesc(imgdata, src, context) + if self.langEngine != None: + chars = self.langEngine.refineDesc(desc) + + alt = f"IMAGE CAPTION: {desc}" + if self.ocrEngine != None: + chars = self.genChars(imgdata, src) + if self.langEngine != None: + chars = self.langEngine.refineOCR(chars) + alt = f"{alt}\nTEXT IN IMAGE: {chars}" + + return alt + + def genAltTextV2(self, src: str) -> str: + imgdata = self.getImgData(src) + context = [None, None] + if self.options["withContext"]: + context = self.getContext(self.getImg(src)) + desc = self.genDesc(imgdata, src, context) + + chars = "" + if self.ocrEngine != None: + chars = self.genChars(imgdata, src).strip() + + if self.langEngine == None: + raise Exception("To use version 2, you must have a langEngine set.") + + return self.langEngine.refineAlt(desc, chars, context, None) + + def genAltText(self, src: str) -> str: + if self.options["version"] == 1: + return self.genAltTextV1(src) + return self.genAltTextV2(src) + + def genAssociation( + self, + tag: bs4.element.Tag, + ) -> dict: + src = tag.attrs["src"] + alt = self.genAltText(src) + association = {"src": src, "alt": alt} + if self.options["withHash"]: + data = self.getImgData(src) + association["hash"] = hash(data) + return association + + def _genAltAssociationsST(self, tags: list[bs4.element.Tag]) -> list[dict]: + associations = [] + for tag in tags: + associations.append(self.genAssociation(tag)) + return associations + + def _genAltAssociationsMT( + self, + tags: list[bs4.element.Tag], + ) -> list[dict]: + associations = [] + + def genAppend(tag): + associations.append(self.genAssociation(tag)) + + threads: list[Thread] = [] + for tag in tags: + thread = Thread( + target=genAppend, + args=(tag,), + ) + thread.start() + threads.append(thread) + for thread in threads: + thread.join() + return associations + + def genAltAssociations( + self, + tags: list[bs4.element.Tag], + ) -> list[dict]: + if self.options["multiThreaded"]: + return self._genAltAssociationsMT(tags) + return self._genAltAssociationsST(tags) + + +class AltTextEPUB(AltText): + def __init__(self) -> None: + return None + + def checkData(self) -> bool: + if not hasattr(self, "data"): + raise Exception("no data set. please use .parse or .parseFile") + return True + + def parse(self, epub: epub.EpubBook) -> epub.EpubBook: + self.data = epub + return self.data + + def parseFile(self, filepath: str) -> epub.EpubBook: + book = epub.read_epub(filepath, {"ignore_ncx": True}) + self.data = book + return book + + def getAllImgs(self) -> typing.List[bs4.element.Tag]: + documents = self.data.get_items_of_type(ebooklib.ITEM_DOCUMENT) + imgs = [] + for docs in documents: + # features="xml" + soup = getSoup(docs.get_content()) + imgsInDoc = soup.find_all("img") + for img in imgsInDoc: + imgs.append(img) + return imgs + + def getNoAltImgs(self) -> typing.List[bs4.element.Tag]: + imgs = self.getAllImgs() + noalt = [] + for img in imgs: + if not "alt" in img.attrs.keys() or img.attrs["alt"].strip() == "": + noalt.append(img) + return noalt + + def setAlt(self, src: str, text: str): + self.checkData() + documents = self.data.get_items_of_type(ebooklib.ITEM_DOCUMENT) + for doc in documents: + soup = getSoup(doc.get_content()) + imgsInDoc = soup.find_all("img") + for img in imgsInDoc: + if img.attrs["src"] == src: + img.attrs["alt"] = text + newHtml = soup.prettify() + doc.set_content(newHtml.encode("utf-8")) + return + raise Exception("unable to find image with src '{src}'".format(src=src)) + + def export(self) -> epub.EpubBook: + self.checkData() + return self.data + + def exportToFile(self, path: str) -> str: + epub.write_epub(path, self.export()) + return path diff --git a/tests/automate.py b/tests/automate.py index 3be6f2f..dcd7ada 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -12,6 +12,10 @@ import importlib sys.path.append("c:/Users/ketha/Code/Senior D") #This will need to be changed system to system AltTextHTML = importlib.import_module("alt-text.src.alttext.alttext").AltTextHTML PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine").PrivateGPT +descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine.py' + + + # access downloaded books and go thru all of them # 1. parse html file to find img src to get the before and after context (using get context funct) @@ -24,10 +28,10 @@ class AltTextGenerator(AltTextHTML): # uses the class from alttext.py # adds relevant benchmarking and saving methods - def __init__(self): - super().__init__() + def __init__(self, api_key, descengine): + super().__init__(descengine) self.benchmark_records = [] - + self.api_key = api_key #Use genAltTextV2 #ADD benchmark time stamps def genAltTextV2(self, src: str) -> str: @@ -109,26 +113,35 @@ class AltTextGenerator(AltTextHTML): writer.writerow(record) print(f"CSV file has been generated at: {csv_file_path}") +def import_descengine(): + #Key Stuff + spec = importlib.util.spec_from_file_location("descengine", descengine_path) + descengine = importlib.util.module_from_spec(spec) + sys.modules["descengine"] = descengine + spec.loader.exec_module(descengine) + return descengine + def automate_process(extr_folder : str): # Iterate through all images in a folder to produce a table (csv) with benchmarking + descengine = import_descengine() + minigpt4_key = descengine.REPLICATE_MODELS['minigpt4'] - generator = AltTextGenerator() + generator = AltTextGenerator(minigpt4_key, descengine) # Iterate thru each book in folder (ex. downloaded_books) - for book_id in os.listdir(extr_folder): - book_path = os.path.join(extr_folder, book_id) - if os.path.isdir(book_path): + if os.path.exists(extr_folder): + for book_id in os.listdir(extr_folder): + book_path = os.path.join(extr_folder, book_id) + if os.path.isdir(book_path): + for filename in os.listdir(book_path): + filepath = os.path.join(book_path, filename) - # Iterate thru files in the book's directory - for filename in os.listdir(book_path): - filepath = os.path.join(book_path, filename) + # Check if the file is an HTML file + if filepath.endswith(".html"): - # Check if the file is an HTML file - if filepath.endswith(".html"): - - # Use the parseFile method to parse the HTML file for the genAltText function - soup = generator.parseFile(filepath) - generator.genAltText(soup) + # Use the parseFile method to parse the HTML file for the genAltText function + soup = generator.parseFile(filepath) + generator.genAltText(soup) generator.generate_csv('test_benchmark.csv', generator.benchmark_records) From 0d47c0c4f3e5285642fffaf439b727b97d66d9e6 Mon Sep 17 00:00:00 2001 From: kethan351 Date: Wed, 20 Mar 2024 17:21:21 -0400 Subject: [PATCH 08/11] still not workin --- src/alttext/alttext.py | 14 +++++----- tests/automate.py | 59 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 18 deletions(-) diff --git a/src/alttext/alttext.py b/src/alttext/alttext.py index 65e9271..11fcce6 100644 --- a/src/alttext/alttext.py +++ b/src/alttext/alttext.py @@ -1,15 +1,15 @@ from abc import ABC, abstractmethod import typing from threading import Thread +import time import bs4 import ebooklib from ebooklib import epub - -from descengine import DescEngine -from ocrengine import OCREngine -from langengine import LangEngine +from .descengine.descengine import DescEngine +from .ocrengine.ocrengine import OCREngine +from .langengine.langengine import LangEngine DEFOPTIONS = { @@ -84,7 +84,7 @@ class AltText(ABC): # PARSING METHODS @abstractmethod - def parse(self, data: str) -> bs4.BeautifulSoup | epub.EpubBook: + def parse(self, data: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: """Parses data into a BeautifulSoup or EpubBook object. Args: @@ -96,7 +96,7 @@ class AltText(ABC): pass @abstractmethod - def parseFile(self, filepath: str) -> bs4.BeautifulSoup | epub.EpubBook: + def parseFile(self, filepath: str) -> typing.Union[bs4.BeautifulSoup, epub.EpubBook]: """Parses data from a file into a BeautifulSoup or EpubBook object. Args: @@ -163,7 +163,7 @@ class AltText(ABC): pass @abstractmethod - def export(self) -> str | epub.EpubBook: + def export(self) -> typing.Union[str, epub.EpubBook]: """Exports the current data. Returns: diff --git a/tests/automate.py b/tests/automate.py index dcd7ada..2677a19 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -11,8 +11,8 @@ from bs4 import BeautifulSoup import importlib sys.path.append("c:/Users/ketha/Code/Senior D") #This will need to be changed system to system AltTextHTML = importlib.import_module("alt-text.src.alttext.alttext").AltTextHTML -PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine").PrivateGPT -descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine.py' +PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine.langengine").PrivateGPT +descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine/descengine.py' @@ -24,6 +24,8 @@ descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine. # iterate thru downloaded_books folder, pass html into parseFile + + class AltTextGenerator(AltTextHTML): # uses the class from alttext.py # adds relevant benchmarking and saving methods @@ -34,18 +36,27 @@ class AltTextGenerator(AltTextHTML): self.api_key = api_key #Use genAltTextV2 #ADD benchmark time stamps - def genAltTextV2(self, src: str) -> str: + def genAltTextV2(self, src: str, book_id, image_path, book_path) -> str: # Start total timing total_start_time = time.time() + with open('example.txt', 'w', encoding="utf-8") as file: + #contents = file.read() + file.write(str(src)) + + # Image data extraction timing imgdata_start_time = time.time() + print("starting imaging") + time.sleep(3) imgdata = self.getImgData(src) imgdata_end_time = time.time() imgdata_total_time = imgdata_end_time - imgdata_start_time # Context extraction timing context = [None, None] + print("starting contexting") + time.sleep(3) context_start_time = time.time() if self.options["withContext"]: context = self.getContext(self.getImg(src)) @@ -56,12 +67,16 @@ class AltTextGenerator(AltTextHTML): # Description generation timing genDesc_start_time = time.time() + print("starting desc") + time.sleep(3) desc = self.genDesc(imgdata, src, context) genDesc_end_time = time.time() genDesc_total_time = genDesc_end_time - genDesc_start_time # OCR processing timing ocr_start_time = time.time() + print("starting ocr") + time.sleep(3) chars = "" if self.ocrEngine is not None: chars = self.genChars(imgdata, src).strip() @@ -70,6 +85,8 @@ class AltTextGenerator(AltTextHTML): # Refinement processing timing refine_start_time = time.time() + print("starting refinement") + time.sleep(3) if self.langEngine is None: raise Exception("To use version 2, you must have a langEngine set.") refined_desc = self.langEngine.refineAlt(desc, chars, context, None) @@ -80,14 +97,26 @@ class AltTextGenerator(AltTextHTML): total_end_time = time.time() total_overall_time = total_end_time - total_start_time + #Record dictionary to store all the timing data record = { - "Image Data Extraction Time": imgdata_total_time, - "Context Extraction Time": context_total_time, - "Description Generation Time": genDesc_total_time, - "OCR Processing Time": ocr_total_time, - "Refinement Processing Time": refine_total_time, - "Total Overall Time": total_overall_time + "Book": book_id, + "Image": image_path, + "Path": book_path, + "Status": True, #Set false if failed, set true is worked + "Before Context": beforeContext, + "After Context": afterContext, + "genDesc": desc, + "genDesc-Start": genDesc_start_time, + "genDesc-End": genDesc_end_time, + "genDesc-Time": genDesc_total_time, + "genOCR": chars, + "genOCR-Start": ocr_start_time, + "genOCR-End": ocr_end_time, + "genOCR-Time": ocr_total_time, + "refineDesc": refined_desc, + "refineDesc-Time": refine_total_time, + "Total Time": total_overall_time } # Add record to benchmark_records for later CSV generation self.benchmark_records.append(record) @@ -132,6 +161,7 @@ def automate_process(extr_folder : str): if os.path.exists(extr_folder): for book_id in os.listdir(extr_folder): book_path = os.path.join(extr_folder, book_id) + #alt-text/tests/downloaded_books\120 if os.path.isdir(book_path): for filename in os.listdir(book_path): filepath = os.path.join(book_path, filename) @@ -139,13 +169,20 @@ def automate_process(extr_folder : str): # Check if the file is an HTML file if filepath.endswith(".html"): + #extra layer should: add an extra layer to iterate through the images tab, + #find that image within the .html + #Go to alt-text generation where it will... + #get the context + #generate the alt-text for that image based on the context and other factors + # Use the parseFile method to parse the HTML file for the genAltText function soup = generator.parseFile(filepath) - generator.genAltText(soup) + generator.genAltTextV2(soup, book_id, filepath, book_path) + generator.generate_csv('test_benchmark.csv', generator.benchmark_records) if __name__ == "__main__": print("Running automate.py") - automate_process('downloaded_books') \ No newline at end of file + automate_process('alt-text/tests/downloaded_books') \ No newline at end of file From f206b557a2a2bad437de789105497bc4d5d3d900 Mon Sep 17 00:00:00 2001 From: kethan351 Date: Wed, 20 Mar 2024 22:30:10 -0400 Subject: [PATCH 09/11] more stuff --- tests/automate.py | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/tests/automate.py b/tests/automate.py index 2677a19..00e0664 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -14,18 +14,12 @@ AltTextHTML = importlib.import_module("alt-text.src.alttext.alttext").AltTextHTM PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine.langengine").PrivateGPT descengine_path = 'c:/Users/ketha/Code/Senior D/alt-text/src/alttext/descengine/descengine.py' - - - # access downloaded books and go thru all of them # 1. parse html file to find img src to get the before and after context (using get context funct) # 2. generate alt text using genAltTextV2 (add benchmarking at some point) # 3. save alt text and benchmarking in a csv (see csv file headings) - # iterate thru downloaded_books folder, pass html into parseFile - - class AltTextGenerator(AltTextHTML): # uses the class from alttext.py # adds relevant benchmarking and saving methods @@ -37,18 +31,13 @@ class AltTextGenerator(AltTextHTML): #Use genAltTextV2 #ADD benchmark time stamps def genAltTextV2(self, src: str, book_id, image_path, book_path) -> str: + status = False # Start total timing total_start_time = time.time() - with open('example.txt', 'w', encoding="utf-8") as file: - #contents = file.read() - file.write(str(src)) - - # Image data extraction timing imgdata_start_time = time.time() print("starting imaging") - time.sleep(3) imgdata = self.getImgData(src) imgdata_end_time = time.time() imgdata_total_time = imgdata_end_time - imgdata_start_time @@ -56,7 +45,6 @@ class AltTextGenerator(AltTextHTML): # Context extraction timing context = [None, None] print("starting contexting") - time.sleep(3) context_start_time = time.time() if self.options["withContext"]: context = self.getContext(self.getImg(src)) @@ -68,7 +56,6 @@ class AltTextGenerator(AltTextHTML): # Description generation timing genDesc_start_time = time.time() print("starting desc") - time.sleep(3) desc = self.genDesc(imgdata, src, context) genDesc_end_time = time.time() genDesc_total_time = genDesc_end_time - genDesc_start_time @@ -76,7 +63,6 @@ class AltTextGenerator(AltTextHTML): # OCR processing timing ocr_start_time = time.time() print("starting ocr") - time.sleep(3) chars = "" if self.ocrEngine is not None: chars = self.genChars(imgdata, src).strip() @@ -103,7 +89,7 @@ class AltTextGenerator(AltTextHTML): "Book": book_id, "Image": image_path, "Path": book_path, - "Status": True, #Set false if failed, set true is worked + "Status": status, #Set false if failed, set true is worked "Before Context": beforeContext, "After Context": afterContext, "genDesc": desc, @@ -119,6 +105,7 @@ class AltTextGenerator(AltTextHTML): "Total Time": total_overall_time } # Add record to benchmark_records for later CSV generation + status = True self.benchmark_records.append(record) return refined_desc @@ -165,20 +152,27 @@ def automate_process(extr_folder : str): if os.path.isdir(book_path): for filename in os.listdir(book_path): filepath = os.path.join(book_path, filename) + htmlpath = filepath #This will be how we go back and reference the html that needs to context # Check if the file is an HTML file - if filepath.endswith(".html"): + if htmlpath.endswith(".html"): + soup = generator.parseFile(htmlpath) - #extra layer should: add an extra layer to iterate through the images tab, - #find that image within the .html - #Go to alt-text generation where it will... - #get the context - #generate the alt-text for that image based on the context and other factors + image_path = os.path.join(book_path, 'images') + if os.path.exists(image_path): + for image in os.listdir(image_path): + filepath = os.path.join('images', image) + + if filepath.endswith('.jpg'): + + generator.genAltTextV2(soup, book_id, filepath, book_path) # Use the parseFile method to parse the HTML file for the genAltText function - soup = generator.parseFile(filepath) - generator.genAltTextV2(soup, book_id, filepath, book_path) - + #extra layer should: add an extra layer to iterate through the images tab, + #find that image within the .html + #Go to alt-text generation where it will... + #get the context + #generate the alt-text for that image based on the context and other factors generator.generate_csv('test_benchmark.csv', generator.benchmark_records) From bdac53a646c1f98e8d8b6b65650a8312b1733e82 Mon Sep 17 00:00:00 2001 From: kethan351 Date: Wed, 20 Mar 2024 22:31:45 -0400 Subject: [PATCH 10/11] to dos --- tests/automate.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/automate.py b/tests/automate.py index 00e0664..5f6fd68 100644 --- a/tests/automate.py +++ b/tests/automate.py @@ -9,6 +9,7 @@ import csv import bs4 from bs4 import BeautifulSoup import importlib +#TODO Change the sys path to fit your system sys.path.append("c:/Users/ketha/Code/Senior D") #This will need to be changed system to system AltTextHTML = importlib.import_module("alt-text.src.alttext.alttext").AltTextHTML PrivateGPT = importlib.import_module("alt-text.src.alttext.langengine.langengine").PrivateGPT @@ -167,12 +168,13 @@ def automate_process(extr_folder : str): generator.genAltTextV2(soup, book_id, filepath, book_path) - # Use the parseFile method to parse the HTML file for the genAltText function - #extra layer should: add an extra layer to iterate through the images tab, - #find that image within the .html - #Go to alt-text generation where it will... - #get the context - #generate the alt-text for that image based on the context and other factors + #TODO read below, I don't know if this functionality is built in already + # Use the parseFile method to parse the HTML file for the genAltText function + #extra layer should: add an extra layer to iterate through the images tab, + #find that image within the .html + #Go to alt-text generation where it will... + #get the context + #generate the alt-text for that image based on the context and other factors generator.generate_csv('test_benchmark.csv', generator.benchmark_records)