We treat websites like books we have to read, but modern websites are actually apps. They fetch data from a backend server via hidden API calls (XHR/Fetch) and then paint it on the screen.
When you scrape the HTML (via selectors OR topology), you are scraping the "paint." 🎨
When you intercept the Network Traffic, you are capturing the raw "data packages." 📦
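The difference is easy to see side by side. Below is a minimal, browser-free sketch contrasting the two: the payload and the HTML are invented for illustration (no real site or API is being queried), but the point holds for any data-driven page.

```python
import json

# Hypothetical payload a site's backend might send for a product list
# (field names are illustrative, not from any real API)
raw_response = '{"products": [{"id": 101, "name": "Desk Lamp", "stock": 7}]}'

# The "paint": the HTML the browser renders from that same data
painted_html = '<div class="c-x9z"><span>Desk Lamp</span></div>'

# Scraping the paint means parsing markup and hoping class names never change.
# Intercepting the package is just deserializing structured data:
data = json.loads(raw_response)
print(data["products"][0]["stock"])  # the exact inventory count, absent from the HTML
```

Notice the `stock` field never appears in the rendered markup at all: it only exists in the data package.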
Instead of fighting with divs and spans:

- **Launch a Headless Browser:** Let the site load normally.
- **Listen to the Network:** Monitor all background requests.
- **Snatch the Payload:** When the site asks its server for "product-list," you grab that JSON response directly.
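The "snatch" step boils down to a small routing function. Here is that logic in isolation, exercised with a hand-rolled `FakeResponse` stand-in (an assumption for testing, not Playwright's real `Response` class) so you can see the pattern-to-bucket dispatch without launching a browser:

```python
# Fake stand-in for playwright's Response object, just for this demo
class FakeResponse:
    def __init__(self, url, status, body):
        self.url, self.status, self._body = url, status, body

    def json(self):
        return self._body

patterns = ["/products", "/reviews"]
captured = {p: [] for p in patterns}

def handle_response(response):
    # Same logic the full script attaches via page.on("response", ...)
    for pattern in patterns:
        if pattern in response.url and response.status == 200:
            captured[pattern].append(response.json())
            break  # one bucket per response, even if patterns overlap

handle_response(FakeResponse("https://shop.example/api/products?page=1", 200, {"items": []}))
handle_response(FakeResponse("https://shop.example/api/reviews/42", 200, {"stars": 5}))
print(len(captured["/products"]), len(captured["/reviews"]))  # 1 1
```

In the real script this exact function is registered as a listener; every background request the page makes flows through it automatically.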
✅ 100% Accuracy: You get the exact data structure the database sent.
✅ No HTML Parsing: No BeautifulSoup, no broken classes, no Regex.
✅ Hidden Data: You often get data that isn't even shown on the UI (like exact inventory counts, internal IDs, or timestamps).
**Code below!** 👇
This script intercepts multiple API streams (e.g., products, reviews, and pricing) simultaneously using a `WATCH_LIST`.
```python
from playwright.sync_api import sync_playwright
import os

# Force the browser to use X11 instead of Wayland (Linux only)
os.environ["XDG_SESSION_TYPE"] = "x11"
os.environ["QT_QPA_PLATFORM"] = "xcb"
os.environ["GDK_BACKEND"] = "x11"


def intercept_network_data(target_url, api_patterns_list):
    """
    target_url: The website you are visiting.
    api_patterns_list: A LIST of strings to watch for
        (e.g., ["/products", "/reviews", "price-lookup"])
    """
    # We use a dictionary to separate data by the pattern that caught it
    captured_data = {pattern: [] for pattern in api_patterns_list}

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=False,
            args=[
                "--no-sandbox",
                # Linux users only: uncomment if Chromium misbehaves under Wayland
                # "--ozone-platform=x11",
            ],
        )
        page = browser.new_page()

        def handle_response(response):
            # Check the URL against ALL patterns in our list
            for pattern in api_patterns_list:
                if pattern in response.url and response.status == 200:
                    try:
                        json_body = response.json()
                        print(f"✅ Intercepted [{pattern}]: {response.url[:60]}...")
                        # Store the data in the correct bucket
                        captured_data[pattern].append(json_body)
                        # Break so overlapping patterns don't save the same response twice
                        break
                    except Exception:
                        pass  # Response body was not JSON; ignore it

        # Attach the listener
        page.on("response", handle_response)

        print(f"Navigating to {target_url}...")
        page.goto(target_url)

        # Trigger logic: scroll down to force lazy-loading APIs to fire
        print("Scrolling to trigger lazy-loaded APIs...")
        for _ in range(3):  # Scroll 3 times
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_timeout(2000)  # Wait for network requests to complete

        browser.close()
    return captured_data


# --- Real World Execution ---
URL = "https://scrapfly.io/web-scraping-tools/browser-fingerprint"

# Now we catch multiple different data streams at once
WATCH_LIST = [
    "prelude",  # The main product list
    "flags/",   # The separate review data
    "e/",       # The real-time stock check
]

results = intercept_network_data(URL, WATCH_LIST)

# --- Process Results ---
for pattern, data_list in results.items():
    print(f"\n--- Data for '{pattern}' ---")
    print(f"Captured {len(data_list)} responses.")
    if data_list:
        # Print the keys of the first item to show we got real data
        first_item = data_list[0]
        if isinstance(first_item, dict):
            print(f"Keys found: {list(first_item.keys())}")
        elif isinstance(first_item, list) and len(first_item) > 0:
            print(f"List Sample: {first_item[0]}")
```
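Once the script returns, you will usually want to persist each bucket rather than just print it. Here is a minimal sketch of writing each pattern's captures to its own JSON file; the `results` dict is a hand-built stand-in with the same shape `intercept_network_data` returns, and the filename sanitizing handles the `/` characters that URL patterns often contain:

```python
import json
import pathlib
import tempfile

# Stand-in with the same shape intercept_network_data returns:
# {pattern: [json_body, ...]}
results = {"prelude": [{"ok": True}], "flags/": []}

out_dir = pathlib.Path(tempfile.mkdtemp())
for pattern, data_list in results.items():
    # Patterns may contain '/', which is illegal in filenames
    safe_name = pattern.strip("/").replace("/", "_") or "root"
    path = out_dir / f"{safe_name}.json"
    path.write_text(json.dumps(data_list, indent=2))
    print(f"Wrote {len(data_list)} responses to {path.name}")
```

From there the files can be loaded into pandas, diffed between runs, or fed into whatever pipeline consumes the data.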