Stop writing custom web scrapers for every single site.
One of the biggest headaches in web scraping is maintaining selectors. The moment a site updates its CSS, your script breaks.
I've been experimenting with a "Repeated Topology" approach. Instead of looking for specific IDs or classes, the script looks for structural patterns.
Noise Reduction: Strips out headers, footers, and scripts.
Topology Mapping: It scans for containers whose children share the same HTML signature (e.g., a list of news cards or product tiles).
Automated Extraction: It pulls text, links, and images from those clusters automatically.
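The "HTML signature" idea behind step 2 can be shown in isolation. Here's a minimal sketch (the `signature` helper and the sample markup are illustrative, not part of the full script below): a tag's signature is just its name plus its classes, and when all siblings share one signature, you've found a cluster.

from bs4 import BeautifulSoup

# Hypothetical helper illustrating the "signature" idea from step 2.
def signature(tag):
    """Return a tag's structural signature: its name plus its classes."""
    return f"{tag.name}.{'.'.join(tag.get('class', []))}"

html = """
<ul id="news">
  <li class="card"><a href="/a">Story A</a></li>
  <li class="card"><a href="/b">Story B</a></li>
  <li class="card"><a href="/c">Story C</a></li>
</ul>
"""
soup = BeautifulSoup(html, 'html.parser')
children = soup.find('ul').find_all(recursive=False)
sigs = [signature(c) for c in children]
print(sigs)            # ['li.card', 'li.card', 'li.card']
print(len(set(sigs)))  # 1 -> the children are structurally identical: a cluster

Three children, one signature. That uniformity is what the full script hunts for, no CSS selector required.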
It's not just a scraper; it's a way to find the "heart" of a webpage without being told where it is.
Check out the code below!
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def detailed_cluster_scrape(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    base_url = url

    # 1. Cleanup Noise
    for noise in soup(['script', 'style', 'nav', 'footer', 'header', 'svg', 'noscript']):
        noise.decompose()

    clusters = []

    # 2. Find "Repeated Topology" (the core of the technique)
    # We look for containers where multiple children share the same tag/class structure
    for container in soup.find_all(['div', 'ul', 'section', 'ol', 'p']):
        children = container.find_all(recursive=False)
        if len(children) < 3:
            continue  # Ignore small sections

        # Analyze child similarity (topology)
        child_signatures = [f"{c.name}.{'.'.join(c.get('class', []))}" for c in children]

        # If the majority of children share the same tag/class signature, it's a cluster
        if len(set(child_signatures)) <= 2:  # Heuristic for high similarity
            item_blocks = []
            for child in children:
                # Extract detailed data from each block
                block_data = {
                    "text": " ".join(child.get_text(separator=" ", strip=True).split()),
                    "links": [urljoin(base_url, a['href']) for a in child.find_all('a', href=True)],
                    "images": [urljoin(base_url, img['src']) for img in child.find_all('img', src=True)],
                    "headings": [h.get_text(strip=True) for h in child.find_all(['h1', 'h2', 'h3', 'h4'])]
                }
                # Only add if there is actual content
                if block_data["text"]:
                    item_blocks.append(block_data)

            if item_blocks:
                clusters.append({
                    "container": f"{container.name}.{'.'.join(container.get('class', []))}",
                    "items": item_blocks
                })

    # 3. Rank clusters by item count so the largest one (usually the main content)
    # comes first. Most pages have one dominant cluster.
    clusters.sort(key=lambda x: len(x['items']), reverse=True)
    return clusters
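Why sort at the end? Most pages contain several repeated structures (breadcrumbs, sidebar links, the main grid), and the one with the most items is almost always the content you want. A toy illustration with made-up container names and item counts:

# Hypothetical clusters as the script might detect them (names are invented)
clusters = [
    {"container": "ul.sidebar-links", "items": [{"text": "About"}, {"text": "Contact"}]},
    {"container": "div.article-grid", "items": [{"text": f"Post {i}"} for i in range(12)]},
    {"container": "ol.breadcrumbs", "items": [{"text": "Home"}, {"text": "Blog"}, {"text": "Today"}]},
]

# Same ranking step as in the script: biggest repeated block first
clusters.sort(key=lambda c: len(c["items"]), reverse=True)
print(clusters[0]["container"])  # div.article-grid

So `clusters[0]` is your best guess at the page's "heart", and the smaller clusters are usually navigation chrome you can ignore or inspect separately.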