alt-text/tests/collect.py

84 lines
2.6 KiB
Python

import random
import requests
import bs4
import time
import os
def extractImage(imgs: "list[bs4.element.Tag]") -> "bs4.element.Tag | None":
    """
    Pick a random image that carries non-blank alt-text.

    Every tag whose ``alt`` attribute is present and contains something
    other than whitespace is equally likely to be returned.

    Args:
        imgs: ``<img>`` tags to choose from. The list is not modified.

    Returns:
        One randomly chosen tag with usable alt-text, or None when ``imgs``
        is empty or no tag qualifies.
    """
    # Visit the tags in random order and stop at the first usable one. This
    # is the same as repeatedly drawing without replacement, but it runs in a
    # plain loop, so a page with thousands of alt-less images cannot hit the
    # recursion limit or trigger quadratic list copying.
    shuffled = list(imgs)
    random.shuffle(shuffled)
    for img in shuffled:
        if img.has_attr("alt") and img.attrs["alt"].strip() != "":
            return img
    return None
def collect(
    num: int,
    image_output: str = "images.txt",
    alt_output: str = "alts.txt",
    *,
    timeout: float = 30.0,
) -> bool:
    """
    Collect images with alt-text from random ebooks.

    Repeatedly picks a random book id, downloads that book's HTML edition
    from gutenberg.org and, if the book is marked as English and contains an
    image with alt-text, appends one line to each output file. Lines are
    appended in pairs, so entries written by this function line up between
    the two files.

    Args:
        num (int): Number of images to collect.
        image_output (str, optional): Path to output image URLs. Defaults to "images.txt".
        alt_output (str, optional): Path to output alt-text. Defaults to "alts.txt".
        timeout (float, optional): Seconds to wait for each HTTP request. Defaults to 30.0.

    Returns:
        bool: Always True, once ``num`` images have been recorded.

    Raises:
        requests.RequestException: If several requests in a row fail at the
            network level (for example when the machine is offline).
    """
    # Give up after this many back-to-back network failures so that an
    # unreachable server does not keep the loop spinning forever.
    max_consecutive_errors = 5
    consecutive_errors = 0
    count = 0
    while count < num:
        # Throttle: wait half a second before every attempt.
        time.sleep(0.5)
        bookid = random.randint(1, 70000)
        bookurl = f"https://gutenberg.org/cache/epub/{bookid}/pg{bookid}-images.html"
        try:
            # Without a timeout a stalled connection would block indefinitely.
            response = requests.get(bookurl, timeout=timeout)
        except requests.RequestException as err:
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                raise
            print(f"Failed to fetch book {bookid}: {err}")
            continue
        consecutive_errors = 0
        if response.status_code != 200:
            print(f"Failed to fetch book {bookid}.")
            continue
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        div = soup.find("div", id="pg-machine-header")
        if not div:
            print(f"No 'pg-machine-header' found in book {bookid}.")
            continue
        # The language entry is read from the fourth direct child of the
        # header; a shorter header is treated as unusable instead of crashing.
        header_items = div.find_all(recursive=False)
        if len(header_items) < 4:
            print(f"Unexpected 'pg-machine-header' layout in book {bookid}.")
            continue
        if header_items[3].text.strip() != "Language: English":
            print(f"Book {bookid} is not in English.")
            continue
        imgs = soup.find_all("img")
        # Only images with a "src" can be written to the image list.
        candidates = [tag for tag in imgs if tag.has_attr("src")]
        img = extractImage(candidates)
        if img is None:
            print(
                f"Out of {len(imgs)} images, no images with alt-text found in book {bookid}."
            )
            continue
        # Drop non-ASCII characters, then collapse all whitespace (including
        # newlines) so the alt-text occupies exactly one line and stays
        # aligned with its entry in the image file.
        alt = " ".join(img["alt"].encode("ascii", "ignore").decode().split())
        if not alt:
            print(f"Alt-text of the chosen image in book {bookid} has no ASCII content.")
            continue
        with open(image_output, "a") as imagefile, open(alt_output, "a") as altfile:
            imagefile.write(f"{bookid} cache/epub/{bookid}/{img['src']}\n")
            altfile.write(f"{alt}\n")
        count += 1
    return True
def split(input_file, book_output, image_output):
    """
    Split a "<book number> <image path>" listing into two parallel files.

    Each usable line of ``input_file`` contributes one line to
    ``book_output`` (the first whitespace-separated token) and one line to
    ``image_output`` (everything after it, so paths containing spaces are
    kept whole). Both outputs are opened in append mode, so existing content
    is preserved; they are created even if there is nothing to write.

    Blank lines are ignored. Lines with only one token are reported and
    skipped so the two output files stay aligned line for line.

    Args:
        input_file: Path of the listing to read.
        book_output: Path that receives the book numbers.
        image_output: Path that receives the image paths.
    """
    with open(input_file, "r") as source:
        # Open the outputs once instead of once per input line.
        with open(book_output, "a") as books, open(image_output, "a") as images:
            for line in source:
                fields = line.split(maxsplit=1)
                if not fields:
                    continue
                if len(fields) < 2:
                    print(f"Skipping malformed line: {line.rstrip()!r}")
                    continue
                book_number, image = fields
                books.write(f"{book_number}\n")
                # The remainder still carries the trailing newline/whitespace.
                images.write(f"{image.strip()}\n")
if __name__ == "__main__":
    # The collection step is left disabled here; enable it to append new
    # entries to images.txt / alts.txt before splitting:
    # collect(150)
    split(
        input_file="images.txt",
        book_output="books.txt",
        image_output="images2.txt",
    )