import feedparser
import requests
import time
from datetime import datetime, timedelta
from dateutil import parser
from typing import List, Dict
from bs4 import BeautifulSoup
import logging
import os
import re
import schedule

# Webhook URLs
DEFAULT_WEBHOOK_URL = "WEBHOOK"   # Webhook for feeds.txt
ADVISORIES_WEBHOOK = "WEBHOOK"    # Webhook for advisories.txt
ALERTS_WEBHOOK = "WEBHOOK"        # Webhook for alerts.txt
BUGBOUNTY_WEBHOOK = "WEBHOOK"     # Webhook for bugbounty.txt

# File paths
PROCESSED_LINKS_FILE = 'processed_links.txt'  # File to store processed links
ADVISORIES = 'feeds/advisories.txt'           # File for threat intel / advisory feeds
FEEDS_FILE = 'feeds/feeds.txt'                # File for regular feeds (different webhook)
ALERTS = 'feeds/alerts.txt'
BUGBOUNTY = 'feeds/bugbounty.txt'

# Set up logging
logging.basicConfig(filename='rss_feed_watcher.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


# Helper function to send an embed message to Discord
def send_discord_embed(title: str, link: str, published: datetime, description: str, webhook_url: str):
    embed = {
        "embeds": [{
            "title": title,
            "url": link,
            "description": description,
            "color": 5814783,  # Hex color code for the embed (optional)
            "timestamp": published.isoformat()  # ISO format timestamp
        }]
    }
    try:
        response = requests.post(webhook_url, json=embed)
        if response.status_code != 204:
            logging.error(f"Failed to send message to Discord: {response.status_code} {response.text}")
    except requests.exceptions.RequestException as e:
        logging.error(f"Exception while sending message to Discord: {e}")


# Helper function to clean HTML and extract plain text
def clean_html(html_content: str) -> str:
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator='\n').strip()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'(read more|continue reading|more details|[ \t]*\n[ \t]*\n[ \t]*)', '', text, flags=re.IGNORECASE)
    return text.strip()


# Helper function to get recent posts from an RSS feed
def get_recent_posts(feed_url: str, webhook_url: str, since_time: datetime, processed_links: set) -> List[dict]:
    try:
        response = requests.get(feed_url, timeout=10)
        response.raise_for_status()  # Raise an error for bad HTTP responses
        feed = feedparser.parse(response.content)
        recent_posts = []

        for entry in feed.entries:
            # Try to get and parse the published date from the entry
            published = getattr(entry, 'published', None)
            if published:
                try:
                    published_time = parser.parse(published).replace(tzinfo=None)
                except (ValueError, TypeError):
                    logging.warning(f"Skipping entry with invalid published date: {published}")
                    continue
            else:
                logging.warning("Skipping entry without published date")
                continue

            if published_time > since_time:
                link = entry.link
                if link in processed_links:
                    continue  # Skip already processed posts

                description = getattr(entry, 'description', '')
                plain_text_description = clean_html(description)

                recent_posts.append({
                    'title': entry.title,
                    'link': link,
                    'published': published_time,
                    'description': plain_text_description
                })

                # Add link to the set of processed links
                processed_links.add(link)

                # Send an embed message to Discord for each new post
                send_discord_embed(entry.title, link, published_time, plain_text_description, webhook_url)

        return recent_posts
    except requests.exceptions.Timeout as e:
        logging.error(f"Request timeout for {feed_url}: {e}")
    except requests.exceptions.ConnectionError as e:
        logging.error(f"Connection error for {feed_url}: {e}")
    except requests.exceptions.RequestException as e:
        logging.error(f"HTTP request failed for {feed_url}: {e}")
    return []  # Always return a list so callers can iterate safely


# Load processed links from file
def load_processed_links() -> set:
    if os.path.exists(PROCESSED_LINKS_FILE):
        with open(PROCESSED_LINKS_FILE, 'r') as file:
            return set(line.strip() for line in file)
    return set()


# Save processed links to file
def save_processed_links(processed_links: set):
    with open(PROCESSED_LINKS_FILE, 'w') as file:
        for link in processed_links:
            file.write(f"{link}\n")


# Helper function to load URLs from a file (feed or threat intel)
def load_feed_urls(file_path: str) -> List[str]:
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            return [line.strip() for line in file if line.strip()]
    logging.error(f"{file_path} not found.")
    return []


# Function to process threat intel feeds
def process_threatintel_feeds(threatintel_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in threatintel_feeds:
        logging.info(f"Checking threat intel feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, ADVISORIES_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New threat intel post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)


# Function to process regular feeds
def process_regular_feeds(regular_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in regular_feeds:
        logging.info(f"Checking regular feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, DEFAULT_WEBHOOK_URL, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New regular post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)


# Function to process alert feeds
def process_alert_feeds(alert_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in alert_feeds:
        logging.info(f"Checking alert feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, ALERTS_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New alert post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)


# Function to process bug bounty feeds
def process_bugbounty_feeds(bugbounty_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in bugbounty_feeds:
        logging.info(f"Checking bug bounty feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, BUGBOUNTY_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New bug bounty post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)


# Main function to run the watcher
def rss_feed_watcher():
    print("RUNNING...")
    processed_links = load_processed_links()  # Load previously processed links

    # Load the feed lists
    regular_feeds = load_feed_urls(FEEDS_FILE)
    threatintel_feeds = load_feed_urls(ADVISORIES)
    bug_bounty_feeds = load_feed_urls(BUGBOUNTY)
    alert_feeds = load_feed_urls(ALERTS)

    # Get the timestamp to compare recent posts against (last 12 hours)
    since_time = datetime.now() - timedelta(hours=12)
    since_time = since_time.replace(tzinfo=None)

    print("going over bug bounties...")
    process_bugbounty_feeds(bug_bounty_feeds, since_time, processed_links)

    print("going over regular feeds...")
    process_regular_feeds(regular_feeds, since_time, processed_links)

    print("going over threat intel...")
    process_threatintel_feeds(threatintel_feeds, since_time, processed_links)

    print("going over alerts...")
    process_alert_feeds(alert_feeds, since_time, processed_links)

    # Save updated processed links
    save_processed_links(processed_links)


# Schedule the RSS feed watcher to run every hour
def schedule_rss_watcher():
    schedule.every(1).hours.do(rss_feed_watcher)
    logging.info("RSS Feed Watcher scheduled to run every hour.")
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    rss_feed_watcher()  # Run once immediately
    # Start the scheduled watcher
    schedule_rss_watcher()