Documentation Index Fetch the complete documentation index at: https://docs.scrapegraphai.com/llms.txt
Use this file to discover all available pages before exploring further.
Learn how to walk through paginated listings with ScrapeGraphAI's Extract service. v2 does not ship a built-in total_pages parameter — instead, you iterate through each page URL yourself and merge the results. This example demonstrates how to scrape e-commerce products, news articles, or any paginated content across multiple pages.
The Goal
We'll extract product information from an e-commerce website across multiple pages, including:
| Field | Description |
| --- | --- |
| Product Name | Name of the product |
| Price | Product price |
| Rating | Customer rating |
| Image URL | Product image |
| Description | Product description |
Python SDK - Synchronous Example
#!/usr/bin/env python3
"""
Extract Pagination Example (Sync)
Iterate through paginated listings and merge the extracted items.
"""
import json
import os
import time
from typing import List, Optional
from dotenv import load_dotenv
from pydantic import BaseModel
from scrapegraph_py import ScrapeGraphAI
load_dotenv()
class ProductInfo(BaseModel):
    """One product entry requested from the extraction.

    Only ``name`` is required; every other field falls back to ``None``
    when it is not supplied.
    """

    name: str
    price: Optional[str] = None
    rating: Optional[str] = None
    image_url: Optional[str] = None
    description: Optional[str] = None
class ProductList(BaseModel):
    """Top-level container: the ``products`` list of ``ProductInfo`` entries."""

    products: List[ProductInfo]
def page_urls(base: str, pages: int) -> list[str]:
    """Build the URLs for pages ``1..pages`` of a paginated listing.

    The page number is appended as a ``page`` query parameter. The separator
    is chosen from ``base`` so the result is a well-formed URL whether or not
    ``base`` already carries a query string.

    Args:
        base: Listing URL without the page parameter.
        pages: Number of pages to generate; values below 1 yield an empty list.

    Returns:
        One URL per page, in ascending page order.
    """
    # Adapt this to your target site's pagination scheme.
    if base.endswith(("?", "&")):
        separator = ""  # base is already positioned for another parameter
    elif "?" in base:
        separator = "&"  # extend the existing query string
    else:
        separator = "?"  # start a new query string
    return [f"{base}{separator}page={i}" for i in range(1, pages + 1)]
def main():
    """Extract products from three listing pages, one at a time, and print a summary.

    Each URL produced by ``page_urls`` is passed to ``sgai.extract``. Pages
    whose status is not ``"success"`` are reported and skipped; the
    ``products`` entries of the remaining pages are merged into a single list.
    Finally the elapsed time, the total count and the first three products
    are printed.
    """
    sgai = ScrapeGraphAI()  # reads SGAI_API_KEY from env
    base_url = "https://www.amazon.in/s?k=tv"
    prompt = "Extract all product info including name, price, rating, image_url, and description"
    schema = ProductList.model_json_schema()

    all_products: list[dict] = []
    start = time.time()
    for url in page_urls(base_url, pages=3):
        res = sgai.extract(prompt, url=url, schema=schema)
        if res.status != "success":
            # Keep going: one failed page should not discard the others.
            print(f"Page failed: {url} - {res.error}")
            continue
        # `or {}` guards against a falsy json_data before looking up "products".
        products = (res.data.json_data or {}).get("products", [])
        print(f"{url} -> {len(products)} products ({res.elapsed_ms}ms)")
        all_products.extend(products)

    print(f"\nDone in {time.time() - start:.1f}s. Total: {len(all_products)} products")
    # Show only a small sample; ensure_ascii=False keeps non-ASCII text readable.
    print(json.dumps(all_products[:3], indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
Python SDK - Asynchronous Example
#!/usr/bin/env python3
"""
Extract Pagination Example (Async)
Fetch every page in parallel with AsyncScrapeGraphAI.
"""
import asyncio
import json
import time
from typing import List, Optional
from dotenv import load_dotenv
from pydantic import BaseModel
from scrapegraph_py import AsyncScrapeGraphAI
load_dotenv()
class ProductInfo(BaseModel):
    """One product entry requested from the extraction.

    Only ``name`` is required; every other field falls back to ``None``
    when it is not supplied.
    """

    name: str
    price: Optional[str] = None
    rating: Optional[str] = None
    image_url: Optional[str] = None
    description: Optional[str] = None
class ProductList(BaseModel):
    """Top-level container: the ``products`` list of ``ProductInfo`` entries."""

    products: List[ProductInfo]
async def main():
    """Extract products from three listing pages concurrently and print a summary.

    One ``sgai.extract`` call is created per page URL and all of them are
    awaited together with ``asyncio.gather``. Results are then paired back
    with their URLs; pages whose status is not ``"success"`` are reported and
    skipped, and the ``products`` entries of the rest are merged into a
    single list.
    """
    base_url = "https://www.amazon.in/s?k=tv"
    prompt = "Extract all product info including name, price, rating, image_url, and description"
    schema = ProductList.model_json_schema()
    urls = [f"{base_url}&page={i}" for i in range(1, 4)]

    start = time.time()
    async with AsyncScrapeGraphAI() as sgai:
        tasks = [sgai.extract(prompt, url=u, schema=schema) for u in urls]
        # gather returns results in the same order as `tasks`, so zip(urls, results) lines up.
        results = await asyncio.gather(*tasks)

    all_products: list[dict] = []
    for url, res in zip(urls, results):
        if res.status != "success":
            print(f"Page failed: {url} - {res.error}")
            continue
        # `or {}` guards against a falsy json_data before looking up "products".
        products = (res.data.json_data or {}).get("products", [])
        print(f"{url} -> {len(products)} products ({res.elapsed_ms}ms)")
        all_products.extend(products)

    print(f"\nDone in {time.time() - start:.1f}s. Total: {len(all_products)} products")
    # Show only a small sample; ensure_ascii=False keeps non-ASCII text readable.
    print(json.dumps(all_products[:3], indent=2, ensure_ascii=False))


if __name__ == "__main__":
    asyncio.run(main())
JavaScript SDK Example
import { ScrapeGraphAI } from 'scrapegraph-js' ;
import 'dotenv/config' ;
// Create the client; it reads SGAI_API_KEY from env.
const sgai = ScrapeGraphAI();

// Listing page to extract from and the instruction describing the wanted fields.
const targetUrl = 'https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2';
const extractionPrompt = 'Extract all product info including name, price, rating, and image_url';

const res = await sgai.extract({ url: targetUrl, prompt: extractionPrompt });

// Print the extracted JSON only when the call reports success.
if (res.status === 'success') {
  const payload = JSON.stringify(res.data?.json, null, 2);
  console.log('Response:', payload);
}
Example Output
{
  "products": [
    {
      "name": "Samsung 55-inch QLED 4K Smart TV",
      "price": "₹45,999",
      "rating": "4.5 out of 5 stars",
      "image_url": "https://example.com/samsung-tv.jpg",
      "description": "Experience stunning 4K resolution with Quantum Dot technology"
    },
    {
      "name": "LG 65-inch OLED 4K Smart TV",
      "price": "₹89,999",
      "rating": "4.7 out of 5 stars",
      "image_url": "https://example.com/lg-tv.jpg",
      "description": "Perfect blacks and infinite contrast with OLED technology"
    },
    {
      "name": "Sony 50-inch Bravia 4K Smart TV",
      "price": "₹52,999",
      "rating": "4.6 out of 5 stars",
      "image_url": "https://example.com/sony-tv.jpg",
      "description": "Crystal clear picture with X1 4K HDR processor"
    }
  ]
}
v2 does not have a built-in total_pages parameter. Instead, build the list of page URLs yourself and call extract once per page — either sequentially (shown in the sync example) or concurrently via AsyncScrapeGraphAI and asyncio.gather.
For JS-rendered pagination, combine extract with FetchConfig:
from scrapegraph_py import FetchConfig

# Fetch settings for JS-rendered pagination: "js" mode, two scrolls for
# dynamic content, and a 1500 ms wait for slow-loading pages.
js_fetch = FetchConfig(mode="js", scrolls=2, wait=1500)

res = sgai.extract(prompt, url=url, schema=schema, fetch_config=js_fetch)
Best Practices
1. Start Small
Begin with 1-2 pages for testing
Gradually increase to your target number
Monitor API usage and rate limits
2. Optimize Prompts
Be specific about what data you want
Include pagination context in your prompt
Use structured output schemas
3. Handle Errors Gracefully
Implement proper error handling
Use try/except blocks (try/catch in JavaScript)
Log errors for debugging
4. Consider Rate Limiting
Respect API rate limits
Use delays between requests if needed
Implement exponential backoff
Track request duration
Monitor success rates
Log pagination results
Common Use Cases
E-commerce Product Scraping
# Extract products from multiple category pages (pages 1-5).
urls = [f"https://example-store.com/electronics?page={i}" for i in range(1, 6)]
# The schema is identical for every page, so build it once outside the loop.
schema = ProductList.model_json_schema()
for url in urls:
    res = sgai.extract(
        "Extract all product information including name, price, rating, and availability",
        url=url,
        schema=schema,
    )
News Article Collection
# Collect articles from multiple news pages (pages 1-3).
urls = [f"https://example-news.com/technology?page={i}" for i in range(1, 4)]
# The schema is identical for every page, so build it once outside the loop.
schema = ArticleList.model_json_schema()
for url in urls:
    res = sgai.extract(
        "Extract article titles, summaries, publication dates, and author names",
        url=url,
        schema=schema,
    )
Job Listing Aggregation
# Gather job listings from multiple pages (pages 1-4).
urls = [f"https://example-jobs.com/search?q=python&page={i}" for i in range(1, 5)]
# The schema is identical for every page, so build it once outside the loop.
schema = JobList.model_json_schema()
for url in urls:
    res = sgai.extract(
        "Extract job titles, companies, locations, salaries, and requirements",
        url=url,
        schema=schema,
    )
Troubleshooting
Common Issues
Pagination Not Working
Check if the website supports pagination
Verify the URL structure includes page parameters
Double-check that your URL builder produces reachable pages
Rate Limiting
Reduce the number of concurrent requests
Implement delays between requests
Check your API usage limits
Incomplete Data
Increase FetchConfig(scrolls=...) for dynamic content
Add FetchConfig(wait=...) (milliseconds) for slow-loading pages
Refine your prompt for better extraction
API Errors
Verify your API key is valid
Check the website URL is accessible
Review error messages for specific issues
Extract Learn more about our AI-powered extraction service
Python SDK Explore our Python SDK documentation
Have a suggestion for a new example? Contact us with your use case or contribute directly on GitHub .