import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET
import time
def is_valid_url(url):
    """Check if the URL is valid and returns a response."""
    try:
        response = requests.get(url, timeout=5)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False
def get_links_from_page(url, depth, max_depth, visited_urls):
    """Recursively get all internal links from a page, respecting the depth limit."""
    if depth > max_depth or url in visited_urls:
        return []
    visited_urls.add(url)
    links = []
    try:
        # Brief pause between requests so the crawl stays polite
        time.sleep(0.5)
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link.get('href')
                # Join relative URLs with the base URL and drop any #fragment
                full_url = urljoin(url, href).split('#')[0]
                # Only follow internal links (same domain)
                if urlparse(full_url).netloc == urlparse(url).netloc:
                    links.append(full_url)
                    # Recursively crawl the page if we're not at max depth
                    links += get_links_from_page(full_url, depth + 1, max_depth, visited_urls)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
    return links
def generate_sitemap(url, max_depth=3):
    """Generate an XML sitemap by crawling the website."""
    visited_urls = set()
    # Include the start URL and remove duplicates while preserving crawl order
    urls = dict.fromkeys([url] + get_links_from_page(url, 0, max_depth, visited_urls))
    # Create XML structure
    urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    for site_url in urls:
        url_element = ET.SubElement(urlset, "url")
        loc = ET.SubElement(url_element, "loc")
        loc.text = site_url
    # Build and save the XML sitemap
    tree = ET.ElementTree(urlset)
    sitemap_filename = "sitemap.xml"
    tree.write(sitemap_filename, encoding="utf-8", xml_declaration=True)
    print(f"Sitemap generated and saved as {sitemap_filename}")
def main():
    """Main function to accept user input and generate the sitemap."""
    website_url = input("Enter the website URL to generate the sitemap: ").strip()
    if not website_url.startswith('http'):
        website_url = 'http://' + website_url
    # Validate the URL
    if not is_valid_url(website_url):
        print(f"Invalid URL or network issue: {website_url}")
        return
    # Get depth input from the user
    try:
        max_depth = int(input("Enter the maximum crawl depth (default is 3): ").strip())
    except ValueError:
        max_depth = 3
    # Start the sitemap generation
    print(f"Generating sitemap for {website_url} with a maximum depth of {max_depth}...")
    generate_sitemap(website_url, max_depth)
if __name__ == "__main__":
    main()
Output
Enter the website URL to generate the sitemap: https://example.com
Enter the maximum crawl depth (default is 3): 2
Generating sitemap for https://example.com with a maximum depth of 2...
Sitemap generated and saved as sitemap.xml
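For reference, the generated sitemap.xml follows the structure built above: a urlset root in the sitemap namespace with one url/loc entry per crawled page. With illustrative URLs (and whitespace added for readability, since ElementTree writes the elements on a single line), the file looks roughly like this:

<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>https://example.com/</loc>
    </url>
    <url>
        <loc>https://example.com/about</loc>
    </url>
</urlset>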