Sitemap Generator - Python Programming Exercise

In this exercise, you will develop a Python program that generates a sitemap for a website. It is well suited to practicing web crawling, URL retrieval, and XML generation in Python, and implementing it gives you hands-on experience with each of these tasks. The exercise not only reinforces your understanding of web crawling but also helps you develop efficient coding practices for managing user interactions.

Exercise

Sitemap Generator

Objective

Develop a Python program that generates a sitemap for a website. The program should crawl a given website, retrieve all the URLs, and generate an XML sitemap that lists these URLs. Ensure the program handles various types of links (internal, external) and allows customization for the depth of the crawl. Include error handling for invalid URLs and network issues.

Example Python Exercise

Show Python Code

Copy Python Code

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET
import time

def is_valid_url(url):
    """Report whether a GET request to ``url`` answers with status 200.

    The request uses a 5-second timeout. Any ``requests`` exception raised
    while fetching is treated as "not valid" and yields ``False``.
    """
    try:
        status = requests.get(url, timeout=5).status_code
    except requests.exceptions.RequestException:
        return False
    return status == 200

def get_links_from_page(url, depth, max_depth, visited_urls):
    """Recursively collect same-domain links reachable from ``url``.

    Args:
        url: Absolute URL of the page to fetch and scan.
        depth: Depth of ``url`` in the crawl (the start page is 0).
        max_depth: Deepest level whose pages are still fetched.
        visited_urls: Set of URLs already fetched; updated in place so no
            page is requested twice during one crawl.

    Returns:
        A list of absolute internal URLs in discovery order, with any
        ``#fragment`` removed. A URL may appear more than once when several
        pages link to it. The list is empty when ``depth`` exceeds
        ``max_depth``, when ``url`` was already visited, when the fetch
        fails, or when the response status is not 200.
    """
    if depth > max_depth or url in visited_urls:
        return []

    visited_urls.add(url)
    links = []

    try:
        # A timeout keeps one unresponsive server from stalling the crawl.
        response = requests.get(url, timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return links

    if response.status_code != 200:
        return links

    base_netloc = urlparse(url).netloc
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        # Join relative URLs with the base URL
        parsed = urlparse(urljoin(url, link.get('href')))

        # Only follow internal links (same domain)
        if parsed.netloc != base_netloc:
            continue

        # "#section" anchors point at the same document, so drop the
        # fragment to avoid listing one page under several URLs.
        full_url = parsed._replace(fragment='').geturl()
        links.append(full_url)

        # The guard at the top of the callee enforces the depth limit
        # and skips pages that were already fetched.
        links += get_links_from_page(full_url, depth + 1, max_depth, visited_urls)

    return links

def generate_sitemap(url, max_depth=3):
    """Crawl ``url`` and write the discovered links to ``sitemap.xml``.

    Args:
        url: Absolute URL where the crawl starts.
        max_depth: Maximum crawl depth passed to ``get_links_from_page``.

    The file is written to the current working directory as UTF-8 with an
    XML declaration, and each distinct URL is listed once.
    """
    visited_urls = set()
    urls = get_links_from_page(url, 0, max_depth, visited_urls)

    # Create XML structure
    urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    # The crawler reports a link every time it is seen; dict.fromkeys drops
    # the repeats while keeping first-seen order.
    for site_url in dict.fromkeys(urls):
        url_element = ET.SubElement(urlset, "url")
        loc = ET.SubElement(url_element, "loc")
        loc.text = site_url

    # Build and save the XML sitemap
    tree = ET.ElementTree(urlset)
    sitemap_filename = "sitemap.xml"
    # The sitemap protocol requires UTF-8; state it explicitly instead of
    # relying on ElementTree's default output encoding.
    tree.write(sitemap_filename, encoding="utf-8", xml_declaration=True)
    print(f"Sitemap generated and saved as {sitemap_filename}")

def main():
    """Prompt for a site URL and crawl depth, then generate the sitemap.

    A missing scheme is filled in with ``http://``. The URL is checked with
    ``is_valid_url`` before crawling. Depth input that is not a
    non-negative integer falls back to the default of 3.
    """
    website_url = input("Enter the website URL to generate the sitemap: ").strip()
    # Match complete scheme prefixes, case-insensitively: a bare
    # startswith('http') would treat a host such as "httpbin.org" as if it
    # already carried a scheme.
    if not website_url.lower().startswith(('http://', 'https://')):
        website_url = 'http://' + website_url

    # Validate the URL
    if not is_valid_url(website_url):
        print(f"Invalid URL or network issue: {website_url}")
        return

    # Get depth input from the user
    try:
        max_depth = int(input("Enter the maximum crawl depth (default is 3): ").strip())
    except ValueError:
        max_depth = 3
    if max_depth < 0:
        # With a negative limit the crawler's depth guard rejects even the
        # start page, so use the advertised default instead.
        max_depth = 3

    # Start the sitemap generation
    print(f"Generating sitemap for {website_url} with a maximum depth of {max_depth}...")
    generate_sitemap(website_url, max_depth)

# Run the interactive prompt only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET
import time


def is_valid_url(url):
    """Report whether a GET request to ``url`` answers with status 200.

    The request uses a 5-second timeout. Any ``requests`` exception raised
    while fetching is treated as "not valid" and yields ``False``.
    """
    try:
        status = requests.get(url, timeout=5).status_code
    except requests.exceptions.RequestException:
        return False
    return status == 200


def get_links_from_page(url, depth, max_depth, visited_urls):
    """Recursively collect same-domain links reachable from ``url``.

    Args:
        url: Absolute URL of the page to fetch and scan.
        depth: Depth of ``url`` in the crawl (the start page is 0).
        max_depth: Deepest level whose pages are still fetched.
        visited_urls: Set of URLs already fetched; updated in place so no
            page is requested twice during one crawl.

    Returns:
        A list of absolute internal URLs in discovery order, with any
        ``#fragment`` removed. A URL may appear more than once when several
        pages link to it. The list is empty when ``depth`` exceeds
        ``max_depth``, when ``url`` was already visited, when the fetch
        fails, or when the response status is not 200.
    """
    if depth > max_depth or url in visited_urls:
        return []

    visited_urls.add(url)
    links = []

    try:
        # A timeout keeps one unresponsive server from stalling the crawl.
        response = requests.get(url, timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return links

    if response.status_code != 200:
        return links

    base_netloc = urlparse(url).netloc
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        # Join relative URLs with the base URL
        parsed = urlparse(urljoin(url, link.get('href')))

        # Only follow internal links (same domain)
        if parsed.netloc != base_netloc:
            continue

        # "#section" anchors point at the same document, so drop the
        # fragment to avoid listing one page under several URLs.
        full_url = parsed._replace(fragment='').geturl()
        links.append(full_url)

        # The guard at the top of the callee enforces the depth limit
        # and skips pages that were already fetched.
        links += get_links_from_page(full_url, depth + 1, max_depth, visited_urls)

    return links


def generate_sitemap(url, max_depth=3):
    """Crawl ``url`` and write the discovered links to ``sitemap.xml``.

    Args:
        url: Absolute URL where the crawl starts.
        max_depth: Maximum crawl depth passed to ``get_links_from_page``.

    The file is written to the current working directory as UTF-8 with an
    XML declaration, and each distinct URL is listed once.
    """
    visited_urls = set()
    urls = get_links_from_page(url, 0, max_depth, visited_urls)

    # Create XML structure
    urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    # The crawler reports a link every time it is seen; dict.fromkeys drops
    # the repeats while keeping first-seen order.
    for site_url in dict.fromkeys(urls):
        url_element = ET.SubElement(urlset, "url")
        loc = ET.SubElement(url_element, "loc")
        loc.text = site_url

    # Build and save the XML sitemap
    tree = ET.ElementTree(urlset)
    sitemap_filename = "sitemap.xml"
    # The sitemap protocol requires UTF-8; state it explicitly instead of
    # relying on ElementTree's default output encoding.
    tree.write(sitemap_filename, encoding="utf-8", xml_declaration=True)
    print(f"Sitemap generated and saved as {sitemap_filename}")


def main():
    """Prompt for a site URL and crawl depth, then generate the sitemap.

    A missing scheme is filled in with ``http://``. The URL is checked with
    ``is_valid_url`` before crawling. Depth input that is not a
    non-negative integer falls back to the default of 3.
    """
    website_url = input("Enter the website URL to generate the sitemap: ").strip()
    # Match complete scheme prefixes, case-insensitively: a bare
    # startswith('http') would treat a host such as "httpbin.org" as if it
    # already carried a scheme.
    if not website_url.lower().startswith(('http://', 'https://')):
        website_url = 'http://' + website_url

    # Validate the URL
    if not is_valid_url(website_url):
        print(f"Invalid URL or network issue: {website_url}")
        return

    # Get depth input from the user
    try:
        max_depth = int(input("Enter the maximum crawl depth (default is 3): ").strip())
    except ValueError:
        max_depth = 3
    if max_depth < 0:
        # With a negative limit the crawler's depth guard rejects even the
        # start page, so use the advertised default instead.
        max_depth = 3

    # Start the sitemap generation
    print(f"Generating sitemap for {website_url} with a maximum depth of {max_depth}...")
    generate_sitemap(website_url, max_depth)


# Run the interactive prompt only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Output

Enter the website URL to generate the sitemap: https://example.com
Enter the maximum crawl depth (default is 3): 2
Generating sitemap for https://example.com with a maximum depth of 2...
Sitemap generated and saved as sitemap.xml

Example Code Copied

More Python Programming Exercises of Using Extra Libraries

Explore our set of Python Programming Exercises! Specifically designed for beginners, these exercises will help you develop a solid understanding of the basics of Python. From variables and data types to control structures and simple functions, each exercise is crafted to challenge you incrementally as you build confidence in coding in Python.

Generating a List of Images as HTML
In this exercise, you will develop a Python program that generates an HTML file displaying a list of images from a specified directory. This exercise is perfec...
Retrieving System Information
In this exercise, you will develop a Python program that retrieves and displays system information, such as the operating system, CPU details, memory usage, and disk ...
Sitemap Generator V2
In this exercise, you will develop an enhanced version of a sitemap generator in Python (Sitemap Generator V2). This exercise is perfect for practicing web cra...
Exploring a Directory
In this exercise, you will develop a Python program that explores a specified directory and lists all of its contents, including files and subdirectories. This exe...
Exploring Subdirectories
In this exercise, you will develop a Python program that explores a specified directory and lists all the subdirectories within it. This exercise is perfect fo...
Working with Date and Time
In this exercise, you will develop a Python program that works with date and time. This exercise is perfect for practicing date and time manipulation, user inp...

Sitemap Generator - Python Programming Exercise

Category

Using Extra Libraries

Exercise