import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup
def is_valid_url(url):
    """Check if a URL is reachable (returns HTTP 200)."""
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False
def get_soup(url):
    """Get the BeautifulSoup object for a given URL."""
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
def get_page_info(url):
    """Get metadata for the page (last modified date, priority)."""
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        last_modified = response.headers.get('Last-Modified')
        if last_modified:
            # Convert the RFC 1123 HTTP date to the W3C date format expected in <lastmod>
            last_modified = parsedate_to_datetime(last_modified).date().isoformat()
        priority = 0.5  # Default priority
        return last_modified, priority
    except requests.exceptions.RequestException as e:
        print(f"Error fetching metadata for {url}: {e}")
        return None, 0.5
def crawl_website(start_url, max_depth, current_depth=0, visited_urls=None):
    """Crawl the website to extract all internal links."""
    if visited_urls is None:
        visited_urls = set()
    # Stop the crawl if the maximum depth is reached
    if current_depth >= max_depth:
        return visited_urls
    soup = get_soup(start_url)
    if not soup:
        return visited_urls
    # Add the current URL to the visited set
    visited_urls.add(start_url)
    # Find all links on the page
    links = soup.find_all('a', href=True)
    for link in links:
        href = link['href']
        # Resolve relative links and drop fragments so the same page is not counted twice
        full_url = urljoin(start_url, href).split('#')[0]
        # Only crawl internal links
        if urlparse(full_url).netloc == urlparse(start_url).netloc:
            if full_url not in visited_urls:
                visited_urls.add(full_url)
                # Recurse into the link; the depth check at the top stops the crawl at max_depth
                crawl_website(full_url, max_depth, current_depth + 1, visited_urls)
    return visited_urls
def generate_sitemap(urls, output_file):
    """Generate an XML sitemap and save it to a file."""
    # Build the XML tree with the standard library, so no external sitemap package is needed
    urlset = ET.Element('urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
    for url in sorted(urls):
        last_modified, priority = get_page_info(url)
        url_element = ET.SubElement(urlset, 'url')
        ET.SubElement(url_element, 'loc').text = url
        if last_modified:
            ET.SubElement(url_element, 'lastmod').text = last_modified
        ET.SubElement(url_element, 'priority').text = str(priority)
    tree = ET.ElementTree(urlset)
    ET.indent(tree, space="  ")  # Pretty-print the output (Python 3.9+)
    tree.write(output_file, encoding='utf-8', xml_declaration=True)
def main():
    """Main function to initiate the sitemap generation."""
    start_url = input("Enter the starting URL (e.g., https://example.com): ").strip()
    max_depth = int(input("Enter the maximum depth for crawling: ").strip())
    output_file = input("Enter the output file name for the sitemap (e.g., sitemap.xml): ").strip()
    print(f"Crawling website: {start_url} with a maximum depth of {max_depth}...")
    # Start the crawling process
    visited_urls = crawl_website(start_url, max_depth)
    # Generate the sitemap and save it to a file
    if visited_urls:
        print(f"Found {len(visited_urls)} URLs. Generating sitemap...")
        generate_sitemap(visited_urls, output_file)
        print(f"Sitemap saved to {output_file}")
    else:
        print("No URLs were found to include in the sitemap.")


if __name__ == '__main__':
    main()
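The script depends on two third-party packages, requests and beautifulsoup4 (installable with pip install requests beautifulsoup4); the sitemap XML itself is written with Python's built-in xml.etree.ElementTree module, so no dedicated sitemap library is required.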
Output
Enter the starting URL (e.g., https://example.com): https://example.com
Enter the maximum depth for crawling: 2
Enter the output file name for the sitemap (e.g., sitemap.xml): sitemap.xml
Crawling website: https://example.com with a maximum depth of 2...
Found 55 URLs. Generating sitemap...
Sitemap saved to sitemap.xml
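Opening sitemap.xml shows one <url> entry per crawled page, following the standard sitemap protocol. The entry below is purely illustrative; the actual URLs come from the crawl, and the lastmod value depends on the Last-Modified header the server returns (it is omitted when the header is missing):

<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/about</loc>
    <lastmod>2024-01-15</lastmod>
    <priority>0.5</priority>
  </url>
  ...
</urlset>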