# News-Hooks-TheRedTeam/newsbot.py

import feedparser
import requests
import time
from datetime import datetime, timedelta
from dateutil import parser
from typing import List, Dict
from bs4 import BeautifulSoup
import logging
import os
import re
import schedule
# Webhook URLs (placeholders; fill in the real Discord webhook URLs)
DEFAULT_WEBHOOK_URL = "WEBHOOK"   # Webhook for feeds.txt
ADVISORIES_WEBHOOK = "WEBHOOK"    # Webhook for advisories.txt
ALERTS_WEBHOOK = "WEBHOOK"        # Webhook for alerts.txt
BUGBOUNTY_WEBHOOK = "WEBHOOK"     # Webhook for bugbounty.txt

# File paths
PROCESSED_LINKS_FILE = 'processed_links.txt'  # File to store processed links
ADVISORIES = 'feeds/advisories.txt'           # File for threat intel / advisory feeds
FEEDS_FILE = 'feeds/feeds.txt'                # File for regular feeds
ALERTS = 'feeds/alerts.txt'                   # File for alert feeds
BUGBOUNTY = 'feeds/bugbounty.txt'             # File for bug bounty feeds

# Set up logging
logging.basicConfig(filename='rss_feed_watcher.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
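# Optional: the webhook URLs can be supplied via environment variables instead of being
# hardcoded above. This is a sketch; the NEWSBOT_* variable names are illustrative and not
# part of the original script. Each falls back to the placeholder constant if unset.
DEFAULT_WEBHOOK_URL = os.getenv("NEWSBOT_DEFAULT_WEBHOOK", DEFAULT_WEBHOOK_URL)
ADVISORIES_WEBHOOK = os.getenv("NEWSBOT_ADVISORIES_WEBHOOK", ADVISORIES_WEBHOOK)
ALERTS_WEBHOOK = os.getenv("NEWSBOT_ALERTS_WEBHOOK", ALERTS_WEBHOOK)
BUGBOUNTY_WEBHOOK = os.getenv("NEWSBOT_BUGBOUNTY_WEBHOOK", BUGBOUNTY_WEBHOOK)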
# Helper function to send an embed message to Discord
def send_discord_embed(title: str, link: str, published: datetime, description: str, webhook_url: str):
    embed = {
        "embeds": [{
            "title": title,
            "url": link,
            "description": description,
            "color": 5814783,                   # Hex color code for the embed (optional)
            "timestamp": published.isoformat()  # ISO format timestamp
        }]
    }
    try:
        response = requests.post(webhook_url, json=embed)
        if response.status_code != 204:
            logging.error(f"Failed to send message to Discord: {response.status_code} {response.text}")
    except requests.exceptions.RequestException as e:
        logging.error(f"Exception while sending message to Discord: {e}")
# Helper function to clean HTML and extract plain text
def clean_html(html_content: str) -> str:
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator='\n').strip()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'(read more|continue reading|more details|[ \t]*\n[ \t]*\n[ \t]*)', '', text, flags=re.IGNORECASE)
    return text.strip()
# Helper function to get recent posts from an RSS feed and push them to Discord
def get_recent_posts(feed_url: str, webhook_url: str, since_time: datetime, processed_links: set) -> List[Dict]:
    try:
        response = requests.get(feed_url, timeout=10)
        response.raise_for_status()  # Raise an error for bad HTTP responses
        feed = feedparser.parse(response.content)
        recent_posts = []
        for entry in feed.entries:
            # Try to get and parse the published date from the entry
            published = getattr(entry, 'published', None)
            if published:
                try:
                    published_time = parser.parse(published).replace(tzinfo=None)
                except (ValueError, TypeError):
                    logging.warning(f"Skipping entry with invalid published date: {published}")
                    continue
            else:
                logging.warning("Skipping entry without published date")
                continue
            if published_time > since_time:
                link = entry.link
                if link in processed_links:
                    continue  # Skip already processed posts
                description = getattr(entry, 'description', '')
                plain_text_description = clean_html(description)
                recent_posts.append({
                    'title': entry.title,
                    'link': link,
                    'published': published_time,
                    'description': plain_text_description
                })
                # Add link to the set of processed links
                processed_links.add(link)
                # Send an embed message to Discord for each new post
                send_discord_embed(entry.title, link, published_time, plain_text_description, webhook_url)
        return recent_posts
    except requests.exceptions.Timeout as e:
        logging.error(f"Request timeout for {feed_url}: {e}")
    except requests.exceptions.ConnectionError as e:
        logging.error(f"Connection error for {feed_url}: {e}")
    except requests.exceptions.RequestException as e:
        logging.error(f"HTTP request failed for {feed_url}: {e}")
    return []  # On failure, return an empty list so callers can iterate safely
# Load processed links from file
def load_processed_links() -> set:
    if os.path.exists(PROCESSED_LINKS_FILE):
        with open(PROCESSED_LINKS_FILE, 'r') as file:
            return set(line.strip() for line in file)
    return set()

# Save processed links to file
def save_processed_links(processed_links: set):
    with open(PROCESSED_LINKS_FILE, 'w') as file:
        for link in processed_links:
            file.write(f"{link}\n")
# Helper function to load URLs from a file (feed or threat intel)
def load_feed_urls(file_path: str) -> List[str]:
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            return [line.strip() for line in file if line.strip()]
    logging.error(f"{file_path} not found.")
    return []
# Function to process threat intel / advisory feeds
def process_threatintel_feeds(threatintel_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in threatintel_feeds:
        logging.info(f"Checking threat intel feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, ADVISORIES_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New threat intel post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)
# Function to process regular feeds
def process_regular_feeds(regular_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in regular_feeds:
        logging.info(f"Checking regular feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, DEFAULT_WEBHOOK_URL, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New regular post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)
# Function to process alert feeds
def process_alert_feeds(alert_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in alert_feeds:
        logging.info(f"Checking alert feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, ALERTS_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New alert post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)
# Function to process bug bounty feeds
def process_bugbounty_feeds(bugbounty_feeds: List[str], since_time: datetime, processed_links: set):
    for feed_url in bugbounty_feeds:
        logging.info(f"Checking bug bounty feed: {feed_url}")
        recent_posts = get_recent_posts(feed_url, BUGBOUNTY_WEBHOOK, since_time, processed_links)
        for post in recent_posts:
            logging.info(f"New bug bounty post: {post['title']}")
            logging.info(f"Link: {post['link']}")
            logging.info(f"Published: {post['published']}")
            logging.info(f"Description: {post['description']}")
            logging.info("-" * 40)
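# The four process_* functions above differ only in their feed list, webhook, and log label.
# A possible consolidation (a sketch, not wired into rss_feed_watcher below; the 'label'
# parameter is illustrative):
def process_feeds(label: str, feed_urls: List[str], webhook_url: str, since_time: datetime, processed_links: set):
    for feed_url in feed_urls:
        logging.info(f"Checking {label} feed: {feed_url}")
        for post in get_recent_posts(feed_url, webhook_url, since_time, processed_links):
            logging.info(f"New {label} post: {post['title']} ({post['link']})")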
# Main function to run the watcher
def rss_feed_watcher():
    print("RUNNING...")
    processed_links = load_processed_links()  # Load previously processed links

    # Load the feed lists
    regular_feeds = load_feed_urls(FEEDS_FILE)
    threatintel_feeds = load_feed_urls(ADVISORIES)
    bug_bounty_feeds = load_feed_urls(BUGBOUNTY)
    alert_feeds = load_feed_urls(ALERTS)

    # Only posts published within the last 12 hours count as recent
    since_time = datetime.now() - timedelta(hours=12)
    since_time = since_time.replace(tzinfo=None)

    print("going over bug bounties...")
    process_bugbounty_feeds(bug_bounty_feeds, since_time, processed_links)
    print("going over regular feeds...")
    process_regular_feeds(regular_feeds, since_time, processed_links)
    print("going over threat intel...")
    process_threatintel_feeds(threatintel_feeds, since_time, processed_links)
    print("going over alerts...")
    process_alert_feeds(alert_feeds, since_time, processed_links)

    # Save updated processed links
    save_processed_links(processed_links)
# Schedule the RSS feed watcher to run every hour
def schedule_rss_watcher():
    schedule.every(1).hours.do(rss_feed_watcher)
    logging.info("RSS Feed Watcher scheduled to run every hour.")
    while True:
        schedule.run_pending()
        time.sleep(1)
if __name__ == "__main__":
    # Run once immediately, then start the scheduled watcher
    rss_feed_watcher()
    schedule_rss_watcher()