# Page monitoring methodology

Patterns for tracking web page changes, detecting content removal, and preserving important pages before they disappear.

## Monitoring service comparison

| Service | Free Tier | Best For | Storage | Alert Speed |
|---|---|---|---|---|
| Visualping | 5 pages | Visual changes | Standard | Minutes |
| ChangeTower | Yes | Compliance, archiving | 12 years | Minutes |
| Distill.io | 25 pages | Element-level tracking | 12 months | Seconds |
| Wachete | Limited | Login-protected pages | 12 months | Minutes |
| UptimeRobot | 50 monitors | Uptime only | 2 months | Minutes |

## Quick-start: Monitor a page

### Distill.io element monitoring

```javascript
// Distill.io allows CSS/XPath selectors for precise monitoring.
// Example selectors for common use cases:

// Monitor news article headlines
const newsSelector = '.article-headline, h1.title, .story-title';

// Monitor price changes
const priceSelector = '.price, .product-price, [data-price]';

// Monitor stock/availability
const availabilitySelector = '.in-stock, .availability, .stock-status';

// Monitor a specific paragraph or section
const sectionSelector = '#main-content p:first-child';

// Monitor table data
const tableSelector = 'table.data-table tbody tr';
```

### Python monitoring script

```python
import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Optional

import requests
from bs4 import BeautifulSoup


class PageMonitor:
    """Simple page change monitor with local storage."""

    def __init__(self, storage_dir: Path):
        self.storage_dir = storage_dir
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.state_file = storage_dir / 'monitor_state.json'
        self.state = self._load_state()

    def _load_state(self) -> dict:
        if self.state_file.exists():
            return json.loads(self.state_file.read_text())
        return {'pages': {}}

    def _save_state(self):
        self.state_file.write_text(json.dumps(self.state, indent=2))

    def _get_page_hash(self, url: str, selector: str = None) -> tuple[str, str]:
        """Get content hash and content for a page or element."""
        response = requests.get(
            url,
            timeout=30,
            headers={'User-Agent': 'Mozilla/5.0 (PageMonitor/1.0)'}
        )
        response.raise_for_status()
        if selector:
            soup = BeautifulSoup(response.text, 'html.parser')
            element = soup.select_one(selector)
            content = element.get_text(strip=True) if element else ''
        else:
            content = response.text
        content_hash = hashlib.sha256(content.encode()).hexdigest()
        return content_hash, content

    def add_page(self, url: str, name: str, selector: str = None):
        """Add a page to monitor."""
        content_hash, content = self._get_page_hash(url, selector)
        self.state['pages'][url] = {
            'name': name,
            'selector': selector,
            'last_hash': content_hash,
            'last_check': datetime.now().isoformat(),
            'last_content': content[:1000],  # Store preview
            'change_count': 0
        }
        self._save_state()
        print(f"Added: {name} ({url})")

    def check_page(self, url: str) -> Optional[dict]:
        """Check a single page for changes."""
        if url not in self.state['pages']:
            return None
        page = self.state['pages'][url]
        selector = page.get('selector')
        try:
            new_hash, new_content = self._get_page_hash(url, selector)
        except Exception as e:
            return {
                'url': url,
                'name': page['name'],
                'status': 'error',
                'error': str(e)
            }
        changed = new_hash != page['last_hash']
        result = {
            'url': url,
            'name': page['name'],
            'status': 'changed' if changed else 'unchanged',
            'previous_content': page['last_content'],
            'new_content': new_content[:1000] if changed else None
        }
        if changed:
            page['last_hash'] = new_hash
            page['last_content'] = new_content[:1000]
            page['change_count'] += 1
            # Archive the change
            archive_file = self.storage_dir / (
                f"{hashlib.md5(url.encode()).hexdigest()}_"
                f"{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
            )
            archive_file.write_text(new_content)
        page['last_check'] = datetime.now().isoformat()
        self._save_state()
        return result

    def check_all(self) -> list[dict]:
        """Check all monitored pages."""
        results = []
        for url in self.state['pages']:
            result = self.check_page(url)
            if result:
                results.append(result)
        return results
```
```python
# Usage
monitor = PageMonitor(Path('./page_monitor_data'))

# Add pages to monitor
monitor.add_page(
    'https://example.com/important-page',
    'Important Page',
    selector='.main-content'  # Optional: monitor a specific element
)

# Check for changes
results = monitor.check_all()
for result in results:
    if result['status'] == 'changed':
        print(f"CHANGED: {result['name']}")
        print(f"  Previous: {result['previous_content'][:100]}...")
        print(f"  New: {result['new_content'][:100]}...")
```

## Uptime monitoring

### UptimeRobot API integration

```python
import requests
from typing import List


class UptimeRobotClient:
    """UptimeRobot API client for monitoring page availability."""

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.uptimerobot.com/v2"

    def _request(self, endpoint: str, params: dict = None) -> dict:
        data = {'api_key': self.api_key}
        if params:
            data.update(params)
        response = requests.post(f"{self.base_url}/{endpoint}", data=data)
        return response.json()

    def get_monitors(self) -> List[dict]:
        """Get all monitors."""
        result = self._request('getMonitors')
        return result.get('monitors', [])

    def create_monitor(self, friendly_name: str, url: str,
                       monitor_type: int = 1) -> dict:
        """Create a new monitor.

        Types: 1=HTTP(s), 2=Keyword, 3=Ping, 4=Port
        """
        return self._request('newMonitor', {
            'friendly_name': friendly_name,
            'url': url,
            'type': monitor_type
        })

    def get_monitor_uptime(self, monitor_id: int,
                           custom_uptime_ratios: str = "7-30-90") -> dict:
        """Get uptime statistics for a monitor."""
        return self._request('getMonitors', {
            'monitors': monitor_id,
            'custom_uptime_ratios': custom_uptime_ratios
        })

    def pause_monitor(self, monitor_id: int) -> dict:
        """Pause a monitor."""
        return self._request('editMonitor', {'id': monitor_id, 'status': 0})

    def resume_monitor(self, monitor_id: int) -> dict:
        """Resume a monitor."""
        return self._request('editMonitor', {'id': monitor_id, 'status': 1})
```
```python
# Usage
client = UptimeRobotClient('your-api-key')

# Create monitors for important pages
client.create_monitor('News Homepage', 'https://example-news.com')
client.create_monitor('API Status', 'https://api.example.com/health')

# Check all monitors (status 2 means "up")
for monitor in client.get_monitors():
    status = 'UP' if monitor['status'] == 2 else 'DOWN'
    print(f"{monitor['friendly_name']}: {status}")
```

## RSS feed generation

### Generate RSS from pages without feeds

```python
import hashlib
from datetime import datetime, timezone
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator


class RSSGenerator:
    """Generate RSS feeds from web pages."""

    def __init__(self, feed_id: str, title: str, link: str):
        self.fg = FeedGenerator()
        self.fg.id(feed_id)
        self.fg.title(title)
        self.fg.link(href=link)
        self.fg.description(f'Auto-generated feed for {title}')

    def add_from_page(self, url: str, item_selector: str,
                      title_selector: str, link_selector: str,
                      description_selector: str = None):
        """Parse a page and add items to the feed.

        Args:
            url: Page URL to parse
            item_selector: CSS selector for each item container
            title_selector: CSS selector for title (relative to item)
            link_selector: CSS selector for link (relative to item)
            description_selector: Optional CSS selector for description
        """
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select(item_selector)
        for item in items[:20]:  # Limit to 20 items
            title_elem = item.select_one(title_selector)
            link_elem = item.select_one(link_selector)
            if not title_elem or not link_elem:
                continue
            title = title_elem.get_text(strip=True)
            link = link_elem.get('href', '')
            # Make absolute URL if relative
            if link.startswith('/'):
                link = urljoin(url, link)
            fe = self.fg.add_entry()
            fe.id(hashlib.md5(link.encode()).hexdigest())
            fe.title(title)
            fe.link(href=link)
            if description_selector:
                desc_elem = item.select_one(description_selector)
                if desc_elem:
                    fe.description(desc_elem.get_text(strip=True))
            # feedgen requires timezone-aware datetimes for published dates
            fe.published(datetime.now(timezone.utc))

    def generate_rss(self) -> str:
        """Generate the RSS XML string."""
        return self.fg.rss_str(pretty=True).decode()

    def save_rss(self, filepath: str):
        """Save the RSS feed to a file."""
        self.fg.rss_file(filepath)
```
```python
# Example: Generate a feed for a news site without RSS
rss = RSSGenerator(
    'https://example.com/news',
    'Example News Feed',
    'https://example.com/news'
)
rss.add_from_page(
    'https://example.com/news',
    item_selector='.news-item',
    title_selector='h2 a',
    link_selector='h2 a',
    description_selector='.summary'
)

# Save the feed
rss.save_rss('example_feed.xml')
```

### Using RSS-Bridge (self-hosted)
```bash
# RSS-Bridge generates feeds for sites without them.
# Supports Twitter, Instagram, YouTube, and many others.

# Docker installation
docker pull rssbridge/rss-bridge
docker run -d -p 3000:80 rssbridge/rss-bridge

# Access at http://localhost:3000
# Select a bridge, enter parameters, and copy the generated RSS feed URL
```
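Once a bridge is configured, its feed URL can be polled like any other feed. A minimal consumption sketch, assuming the third-party `feedparser` package; the bridge name and query parameters below are illustrative, so copy the exact URL from the RSS-Bridge web UI instead:

```python
# Sketch: poll a feed served by a local RSS-Bridge instance.
import feedparser

# Illustrative URL; RSS-Bridge builds these from the bridge name plus its
# per-bridge parameters (check the web UI for the exact query string).
bridge_url = ('http://localhost:3000/?action=display'
              '&bridge=YoutubeBridge&context=By+username'
              '&u=examplechannel&format=Atom')

feed = feedparser.parse(bridge_url)
for entry in feed.entries[:5]:
    print(entry.title, entry.link)
```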
## Social media monitoring

### Twitter/X archiving with Twarc

```bash
# Twarc requires Twitter API credentials.

# Installation
pip install twarc

# Configure credentials
twarc2 configure
```
```python
import json
import subprocess
from pathlib import Path


class TwitterArchiver:
    """Archive Twitter searches and timelines."""

    def __init__(self, output_dir: Path):
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def search(self, query: str, max_results: int = 100) -> Path:
        """Search tweets and save them to a file."""
        output_file = self.output_dir / f"search_{query.replace(' ', '_')}.jsonl"
        subprocess.run(
            ['twarc2', 'search', '--max-results', str(max_results),
             query, str(output_file)],
            check=True
        )
        return output_file

    def get_timeline(self, username: str, max_results: int = 100) -> Path:
        """Get a user timeline."""
        output_file = self.output_dir / f"timeline_{username}.jsonl"
        subprocess.run(
            ['twarc2', 'timeline', '--max-results', str(max_results),
             username, str(output_file)],
            check=True
        )
        return output_file

    def parse_archive(self, filepath: Path) -> list[dict]:
        """Parse archived tweets."""
        tweets = []
        with open(filepath) as f:
            for line in f:
                data = json.loads(line)
                if 'data' in data:
                    tweets.extend(data['data'])
        return tweets
```

## Webhook notifications

### Send alerts on changes

```python
import smtplib
from datetime import datetime
from email.mime.text import MIMEText

import requests


class AlertManager:
    """Send alerts when monitored pages change."""

    def __init__(self, slack_webhook: str = None,
                 discord_webhook: str = None,
                 email_config: dict = None):
        self.slack_webhook = slack_webhook
        self.discord_webhook = discord_webhook
        self.email_config = email_config

    def send_slack(self, message: str, channel: str = None):
        """Send a Slack notification."""
        if not self.slack_webhook:
            return
        payload = {'text': message}
        if channel:
            payload['channel'] = channel
        requests.post(self.slack_webhook, json=payload)

    def send_discord(self, message: str):
        """Send a Discord notification."""
        if not self.discord_webhook:
            return
        requests.post(self.discord_webhook, json={'content': message})

    def send_email(self, subject: str, body: str, to: str):
        """Send an email notification."""
        if not self.email_config:
            return
        msg = MIMEText(body)
        msg['Subject'] = subject
        msg['From'] = self.email_config['from']
        msg['To'] = to
        with smtplib.SMTP(self.email_config['smtp_host'],
                          self.email_config['smtp_port']) as server:
            server.starttls()
            server.login(self.email_config['username'],
                         self.email_config['password'])
            server.send_message(msg)

    def alert_change(self, page_name: str, url: str,
                     old_content: str, new_content: str):
        """Send a change alert to all configured channels."""
        message = f"""
Page Changed: {page_name}
URL: {url}
Time: {datetime.now().isoformat()}

Previous content (preview):
{old_content[:200]}...

New content (preview):
{new_content[:200]}...
"""
        if self.slack_webhook:
            self.send_slack(message)
        if self.discord_webhook:
            self.send_discord(message)
```

## Scheduled monitoring with cron

### Cron setup for continuous monitoring
```bash
# Edit crontab
crontab -e

# Check pages every 15 minutes
*/15 * * * * /usr/bin/python3 /path/to/monitor_script.py >> /var/log/monitor.log 2>&1

# Check critical pages every 5 minutes
*/5 * * * * /usr/bin/python3 /path/to/critical_monitor.py >> /var/log/critical.log 2>&1

# Daily summary report at 8 AM
0 8 * * * /usr/bin/python3 /path/to/daily_report.py
```

### Monitoring script template
```python
#!/usr/bin/env python3
"""Page monitoring script for cron execution."""

import sys
from datetime import datetime
from pathlib import Path

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from monitor import PageMonitor
from alerts import AlertManager


def main():
    # Initialize
    monitor = PageMonitor(Path('./data'))
    alerts = AlertManager(
        slack_webhook='https://hooks.slack.com/services/...',
        discord_webhook='https://discord.com/api/webhooks/...'
    )

    # Check all pages
    results = monitor.check_all()

    # Process results
    changes = [r for r in results if r['status'] == 'changed']
    errors = [r for r in results if r['status'] == 'error']

    # Alert on changes
    for change in changes:
        alerts.alert_change(
            change['name'],
            change['url'],
            change['previous_content'],
            change['new_content']
        )
        print(f"[{datetime.now()}] CHANGE: {change['name']}")

    # Alert on errors
    for error in errors:
        alerts.send_slack(f"Monitor error for {error['name']}: {error['error']}")
        print(f"[{datetime.now()}] ERROR: {error['name']} - {error['error']}")

    # Summary
    print(f"[{datetime.now()}] Checked {len(results)} pages, "
          f"{len(changes)} changes, {len(errors)} errors")


if __name__ == '__main__':
    main()
```

## Archive on change

### Automatic archiving when changes detected

```python
from pathlib import Path

from monitor import PageMonitor  # The PageMonitor class from earlier
from multiarchiver import MultiArchiver


class ArchivingMonitor(PageMonitor):
    """Page monitor that archives content when changes are detected."""

    def __init__(self, storage_dir: Path):
        super().__init__(storage_dir)
        self.archiver = MultiArchiver()
    def check_page(self, url: str) -> dict:
        """Check a page and archive it if changed."""
        result = super().check_page(url)
        if result and result['status'] == 'changed':
            # Archive to multiple services
            archive_results = self.archiver.archive_url(url)
            successful_archives = [
                r.archived_url for r in archive_results if r.success
            ]
            result['archives'] = successful_archives
            # Log archive URLs
            print(f"Archived {url} to:")
            for archive_url in successful_archives:
                print(f"  - {archive_url}")
        return result
```
## Monitoring strategy by use case

### News monitoring

**News/Current Events Monitoring**

Pages to monitor:

- Breaking news sections
- Press release pages
- Government announcement pages
- Company newsrooms

Monitoring frequency:

- Breaking news: every 5 minutes
- Press releases: every 15-30 minutes
- General news: every hour

Archive strategy (see the sketch after this list):

- Archive immediately on detection
- Use both the Wayback Machine and Archive.today
- Save a local copy with a timestamp
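For the immediate-archive step, the Wayback Machine accepts capture requests via its Save Page Now endpoint (`https://web.archive.org/save/<url>`); Archive.today has no comparable stable API, so it is usually submitted via its web form. A minimal sketch; the helper name and directory layout are illustrative:

```python
# Sketch of "archive immediately on detection": save a timestamped local copy,
# then request a Wayback Machine capture of the live page.
import requests
from datetime import datetime
from pathlib import Path


def archive_on_detection(url: str, content: str, local_dir: Path) -> str | None:
    local_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    (local_dir / f'{timestamp}.html').write_text(content)

    # A GET to /save/<url> asks the Wayback Machine to capture the page
    response = requests.get(f'https://web.archive.org/save/{url}', timeout=120)
    # On success, the Content-Location header (when present) names the snapshot
    snapshot = response.headers.get('Content-Location')
    return f'https://web.archive.org{snapshot}' if snapshot else None
```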
### Research monitoring

**Academic/Research Monitoring**

Pages to monitor:

- Preprint servers (arXiv, SSRN)
- Journal tables of contents
- Conference proceedings
- Researcher profiles

Monitoring frequency:

- Daily for active topics
- Weekly for general monitoring

Recommended tools:

- Google Scholar alerts (free, built-in)
- Semantic Scholar alerts
- RSS feeds where available (see the sketch after this list)
- Custom monitors for specific pages
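For the RSS route, preprint servers such as arXiv publish per-category feeds. A minimal polling sketch, assuming the third-party `feedparser` package; the category and keyword are placeholders:

```python
# Sketch: watch an arXiv category feed for titles matching a topic keyword.
import feedparser

feed = feedparser.parse('https://rss.arxiv.org/rss/cs.DL')  # cs.DL: Digital Libraries
for entry in feed.entries:
    if 'web archiving' in entry.title.lower():  # placeholder keyword
        print(entry.title)
        print(entry.link)
```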
### Competitive intelligence

**Competitor Monitoring**

Pages to monitor:

- Pricing pages
- Product pages
- Job postings
- Press releases
- Executive bios

Monitoring frequency (see the scheduling sketch after this list):

- Pricing: daily
- Products: daily
- Jobs: weekly
- Press: daily

Legal considerations:

- Don't violate terms of service
- Don't circumvent access controls
- Public pages only
- Don't scrape at high frequency
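A small sketch of enforcing those per-type frequencies on top of the `PageMonitor` state from earlier; the page-type tags are an assumption layered onto that class, not part of it:

```python
# Sketch: gate checks by page type so pricing/product/press pages are checked
# daily and job postings weekly. Intervals are in hours.
from datetime import datetime, timedelta

SCHEDULE_HOURS = {'pricing': 24, 'products': 24, 'jobs': 168, 'press': 24}


def due_for_check(page_type: str, last_check_iso: str) -> bool:
    """True when the page type's interval has elapsed since the last check."""
    interval = timedelta(hours=SCHEDULE_HOURS.get(page_type, 24))
    return datetime.now() - datetime.fromisoformat(last_check_iso) >= interval
```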
## Best practices

### Monitoring checklist

Before monitoring a page:

- [ ] Is the page publicly accessible?
- [ ] Are you respecting robots.txt? (see the sketch at the end of this section)
- [ ] Is the monitoring frequency reasonable?
- [ ] Do you have a legitimate purpose?
- [ ] Are you storing data securely?
- [ ] Do you have alerts configured?
- [ ] Is archiving set up for important pages?

Maintenance:

- [ ] Review monitors monthly
- [ ] Remove stale monitors
- [ ] Update selectors if pages change
- [ ] Check alert delivery
- [ ] Verify archives are working

### Rate limiting

```python
import time
from functools import wraps


def rate_limit(min_interval: float = 1.0):
    """Decorator to rate limit function calls."""
    last_call = [0.0]

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_call[0]
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            last_call[0] = time.time()
            return func(*args, **kwargs)
        return wrapper
    return decorator
```
```python
# Usage
@rate_limit(min_interval=2.0)  # At most once per 2 seconds
def check_page(url: str):
    return requests.get(url)
```
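For the robots.txt item in the checklist above, a minimal check using Python's standard-library `urllib.robotparser`; the user-agent string is illustrative:

```python
# Sketch: verify that a site's robots.txt permits fetching a URL before
# adding it to a monitor.
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser


def allowed_by_robots(url: str, user_agent: str = 'PageMonitor/1.0') -> bool:
    parser = RobotFileParser()
    parser.set_url(urljoin(url, '/robots.txt'))
    parser.read()
    return parser.can_fetch(user_agent, url)
```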