import os
import re
from bs4 import BeautifulSoup
from ebooklib import epub
# Directory containing the HTML files.
# Placeholder value: change this to your directory before running. The script
# lists this directory and picks up every entry whose name ends in '.html'.
directory = '/path/to/your/html/files'
# Function to extract date from filename and convert to sortable format
def extract_date(filename):
    """Convert a ``DD-MON-YYYY.html`` file name into a sortable date string.

    Args:
        filename: Base name of the file, e.g. ``"05-JAN-3301.html"``. The
            three-letter English month abbreviation is case-insensitive.

    Returns:
        The date as ``"YYYY-MM-DD"``, which orders chronologically when
        compared as plain text, or ``None`` when the name does not start
        with the expected pattern or the month token is not a known
        abbreviation.
    """
    match = re.match(r'(\d{2})-(\w{3})-(\d{4})\.html', filename)
    if match is None:
        return None

    day, month, year = match.groups()
    month_numbers = {
        "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04",
        "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08",
        "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12",
    }
    # \w{3} also accepts tokens such as "XYZ" or "123"; treat those as
    # "no date" rather than letting a KeyError escape to the caller.
    month_num = month_numbers.get(month.upper())
    if month_num is None:
        return None
    return f"{year}-{month_num}-{day}"
# Get list of HTML files sorted by date.
# extract_date() returns None for names it cannot turn into a date, and None
# cannot be ordered against the date strings (sorted() would raise TypeError).
# The tuple key therefore sorts dated files first, chronologically, and puts
# undated files after them, ordered by name.
def _chronological_key(name):
    """Return a sort key placing dated files first and undated files last."""
    sortable_date = extract_date(name)
    return (sortable_date is None, sortable_date or '', name)


html_files = sorted(
    (f for f in os.listdir(directory) if f.endswith('.html')),
    key=_chronological_key,
)

# Create an EPUB book and set its identifier, title and language metadata.
book = epub.EpubBook()
book.set_identifier('id123456')
book.set_title('Elite Dangerous Galnet Articles')
book.set_language('en')
# Function to extract article content from an HTML file
def extract_article_content(filepath):
    """Return the markup of every ``<div class="article">`` in an HTML file.

    Args:
        filepath: Path of the HTML file; it is read as UTF-8 text.

    Returns:
        A list of strings, one per matching div in document order, each
        holding that div's serialized markup.

    Raises:
        ValueError: If the file contains no div with class ``article``.
        OSError: If the file cannot be opened.
    """
    # The document is parsed while the file is open; the tree does not need
    # the file afterwards, so the handle is released right away.
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    article_divs = soup.find_all('div', class_='article')
    if not article_divs:
        raise ValueError(f"Unable to find any article divs in the file: {filepath}")

    # Serialize with str() rather than prettify(): prettify() is meant for
    # visual inspection and inserts newlines/indentation around inline tags,
    # which show up as stray spaces in the rendered chapter text.
    return [str(div) for div in article_divs]
# Add each article as a chapter to the EPUB book.
# One HTML file may yield several articles; the counter runs across all files
# so chapter titles and internal file names stay unique within the book.
chapter_index = 1
for filename in html_files:
    filepath = os.path.join(directory, filename)
    try:
        for article_content in extract_article_content(filepath):
            chapter = epub.EpubHtml(
                title=f'Chapter {chapter_index}',
                file_name=f'chap_{chapter_index}.xhtml',
                lang='en',
            )
            chapter.content = article_content
            book.add_item(chapter)
            # Register the chapter in the table of contents and in the
            # reading order (spine), in that order.
            for listing in (book.toc, book.spine):
                listing.append(chapter)
            chapter_index = chapter_index + 1
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {filename}: {e}")
# Add the default NCX and Nav items to the book, in that order.
for navigation_item in (epub.EpubNcx(), epub.EpubNav()):
    book.add_item(navigation_item)

# Save the EPUB file under a fixed relative name and report it.
epub_file = 'Elite_Dangerous_Galnet_Articles.epub'
epub.write_epub(epub_file, book, {})
print(f"EPUB file created: {epub_file}")