# News-Hooks-TheRedTeam/newsbot.py
import feedparser
import requests
import time
from datetime import datetime, timedelta
from dateutil import parser
from typing import List, Dict
from bs4 import BeautifulSoup
import logging
import os
import re
import schedule

# Webhook URLs
DEFAULT_WEBHOOK_URL = "WEBHOOK"  # Webhook for feeds.txt
ADVISORES_WEBHOOK = "WEBHOOK"    # Webhook for advisories.txt (threat intel)
ALERTS_WEBHOOK = "WEBHOOK"       # Webhook for alerts.txt
BUGBOUNTY_WEBHOOK = "WEBHOOK"    # Webhook for bugbounty.txt

# File paths
PROCESSED_LINKS_FILE = 'processed_links.txt' # File to store processed links
ADVISORES = 'feeds/advisories.txt' # File for threat intel feeds (common webhook)
FEEDS_FILE = 'feeds/feeds.txt' # File for regular feeds (different webhook)
ALERTS = "feeds/alerts.txt"
BUGBOUNTY = "feeds/bugbounty.txt"
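
# The loader below (load_feed_urls) reads one feed URL per line and skips blank lines,
# so each of the files above is assumed to look roughly like this (illustrative URLs):
#   https://example.com/security/advisories.rss
#   https://example.org/blog/feed/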
# Set up logging
logging.basicConfig(filename='rss_feed_watcher.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Helper function to send an embed message to Discord
def send_discord_embed(title: str, link: str, published: datetime, description: str, webhook_url: str):
    embed = {
        "embeds": [{
            "title": title,
            "url": link,
            "description": description,
            "color": 5814783,  # Hex color code for the embed (optional)
            "timestamp": published.isoformat()  # ISO format timestamp
        }]
    }
    try:
        response = requests.post(webhook_url, json=embed)
        if response.status_code != 204:  # Discord webhooks return 204 No Content on success
            logging.error(f"Failed to send message to Discord: {response.status_code} {response.text}")
    except requests.exceptions.RequestException as e:
        logging.error(f"Exception while sending message to Discord: {e}")

# Helper function to clean HTML and extract plain text
def clean_html(html_content: str) -> str:
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator='\n').strip()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'(read more|continue reading|more details|[ \t]*\n[ \t]*\n[ \t]*)', '', text, flags=re.IGNORECASE)
    return text.strip()
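
# Rough example of what clean_html does (illustrative input): tags are stripped, newlines
# collapse to spaces, and boilerplate phrases like "read more" are removed:
#   clean_html("<p>New CVE released</p><p>read more</p>")  ->  "New CVE released"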

# Helper function to get posts from an RSS feed
def get_recent_posts(feed_url: str, webhook_url: str, since_time: datetime, processed_links: set) -> List[Dict]:
    try:
        response = requests.get(feed_url, timeout=10)
        response.raise_for_status()  # Raise an error for bad HTTP responses
        feed = feedparser.parse(response.content)
        recent_posts = []
        for entry in feed.entries:
            # Try to get and parse the published date from the entry
            published = getattr(entry, 'published', None)
            if published:
                try:
                    published_time = parser.parse(published).replace(tzinfo=None)
                except (ValueError, TypeError):
                    logging.warning(f"Skipping entry with invalid published date: {published}")
                    continue
            else:
                logging.warning("Skipping entry without published date")
                continue
            if published_time > since_time:
                link = entry.link
                if link in processed_links:
                    continue  # Skip already processed posts
                description = getattr(entry, 'description', '')
                plain_text_description = clean_html(description)
                recent_posts.append({
                    'title': entry.title,
                    'link': link,
                    'published': published_time,
                    'description': plain_text_description
                })
                # Add link to the set of processed links
                processed_links.add(link)
                # Send an embed message to Discord for each new post
                send_discord_embed(entry.title, link, published_time, plain_text_description, webhook_url)
        return recent_posts
    except requests.exceptions.Timeout as e:
        logging.error(f"Request timeout for {feed_url}: {e}")
    except requests.exceptions.ConnectionError as e:
        logging.error(f"Connection error for {feed_url}: {e}")
    except requests.exceptions.RequestException as e:
        logging.error(f"HTTP request failed for {feed_url}: {e}")
    return []  # On any request failure, return an empty list so callers can iterate safely

# Load processed links from file
def load_processed_links() -> set:
    if os.path.exists(PROCESSED_LINKS_FILE):
        with open(PROCESSED_LINKS_FILE, 'r') as file:
            return set(line.strip() for line in file)
    return set()

# Save processed links to file
def save_processed_links(processed_links: set):
    with open(PROCESSED_LINKS_FILE, 'w') as file:
        for link in processed_links:
            file.write(f"{link}\n")

# Helper function to load URLs from a file (feed or threat intel)
def load_feed_urls(file_path: str) -> List[str]:
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            return [line.strip() for line in file if line.strip()]
    logging.error(f"{file_path} not found.")
    return []
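
# Note: the feed list paths above are relative, so the script is assumed to be run from the
# repository root (or the paths adjusted accordingly) for these files to be found.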

# Function to process threat intel feeds
def process_threatintel_feeds(threatintel_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in threatintel_feeds:
        logging.info(f"Checking threat intel feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, ADVISORES_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New threat intel post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)

# Function to process regular feeds
def process_regular_feeds(regular_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in regular_feeds:
        logging.info(f"Checking regular feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, DEFAULT_WEBHOOK_URL, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New regular post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)

# Function to process alert feeds
def process_alert_feeds(alert_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in alert_feeds:
        logging.info(f"Checking alert feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, ALERTS_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New alert post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)

# Function to process bug bounty feeds
def process_bugbounty_feeds(bugbounty_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in bugbounty_feeds:
        logging.info(f"Checking bug bounty feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, BUGBOUNTY_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New bug bounty post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)

# Main function to run the watcher
def rss_feed_watcher():
    print("RUNNING...")
    processed_links = load_processed_links()  # Load previously processed links
    # Load the feed lists for each category
    regular_feeds = load_feed_urls(FEEDS_FILE)
    threatintel_feeds = load_feed_urls(ADVISORES)
    bug_bounty_feeds = load_feed_urls(BUGBOUNTY)
    alert_feeds = load_feed_urls(ALERTS)
    # Get the timestamp to compare recent posts against (last 12 hours)
    since_time = datetime.now() - timedelta(hours=12)
    since_time = since_time.replace(tzinfo=None)
    print("going over bug bounties...")
    process_bugbounty_feeds(bug_bounty_feeds, since_time, processed_links)
    print("going over regular feeds...")
    process_regular_feeds(regular_feeds, since_time, processed_links)
    print("going over threat intel...")
    process_threatintel_feeds(threatintel_feeds, since_time, processed_links)
    print("going over alerts...")
    process_alert_feeds(alert_feeds, since_time, processed_links)
    # Save updated processed links
    save_processed_links(processed_links)

# Schedule the RSS feed watcher to run every hour
def schedule_rss_watcher():
    schedule.every(1).hours.do(rss_feed_watcher)
    logging.info("RSS Feed Watcher scheduled to run every hour.")
    while True:
        schedule.run_pending()
        time.sleep(1)
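
# Note: with the schedule library, every(1).hours sets the first run roughly one hour after
# startup, which is why rss_feed_watcher() is also called once directly below before handing
# control to the scheduling loop.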

if __name__ == "__main__":
    rss_feed_watcher()
    # Start the scheduled watcher
    schedule_rss_watcher()