import requests
import polars as pl
import time

def fetch_gutendex_books(num_pages=100, *, topic="literature", delay=2.0, timeout=10):
    """Download book records from the Gutendex API into a Polars DataFrame.

    Pages ``1..num_pages`` of ``https://gutendex.com/books/`` are requested
    sequentially. Fetching stops early once the API reports that there is no
    further page. A page that fails to download or decode is reported on
    stdout and skipped (best effort); it does not abort the run.

    Args:
        num_pages: Maximum number of result pages to request.
        topic: Value for the ``topic`` query parameter. ``None`` omits the
            parameter (requests drops parameters whose value is ``None``).
        delay: Seconds to wait between consecutive requests.
        timeout: Per-request timeout in seconds, passed to ``requests``.

    Returns:
        A ``polars.DataFrame`` built from every collected ``results`` entry.
        It has no rows and no columns when nothing could be fetched.
    """
    base_url = "https://gutendex.com/books/"
    all_books = []

    # A session reuses the underlying connection across requests.
    with requests.Session() as session:
        print(f"Starting to fetch {num_pages} pages...")

        for page in range(1, num_pages + 1):
            # Pause *before* every request except the first. This way the
            # delay is also honoured after a failed request (when backing off
            # matters most) and is not wasted after the final page.
            if page > 1:
                time.sleep(delay)

            try:
                # Let requests build and URL-encode the query string.
                response = session.get(
                    base_url,
                    params={"page": page, "topic": topic},
                    timeout=timeout,
                )
                response.raise_for_status()  # Raise for 4xx/5xx responses
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers JSON decode failures on requests versions
                # whose decode error is not a RequestException subclass.
                print(f"\nError fetching page {page}: {e}")
                continue

            # Tolerate an unexpected (non-object) payload instead of crashing.
            payload = data if isinstance(data, dict) else {}
            all_books.extend(payload.get("results") or [])

            print(f"Fetched page {page}/{num_pages}", end='\r')

            # Stop only when the API explicitly says there is no next page;
            # a payload without the key gives no such signal.
            if "next" in payload and payload["next"] is None:
                break

    print(f"\nParsing {len(all_books)} books into Polars DataFrame...")

    # Build the frame once from the list of dicts.
    # infer_schema_length=None makes Polars scan every row when inferring types.
    return pl.DataFrame(all_books, infer_schema_length=None)

# Fetch up to 50 pages of results (this performs network requests).
df = fetch_gutendex_books(50)

# Reduce each book to a single row that carries one summary and the fields of
# its first author as top-level columns.
df_processed = (
    df
    # 1. Keep one row per book id. maintain_order=True preserves the fetch
    #    order; without it the order of the output rows is not guaranteed.
    .unique(subset=["id"], keep="first", maintain_order=True)

    # 2. Replace each list column with its first element (null when the list
    #    is empty). Taking the first summary directly avoids exploding the
    #    list into extra rows that would then be discarded again.
    .with_columns(
        pl.col("summaries").list.first(),
        pl.col("authors").list.first(),
    )

    # 3. Unnest the author struct into top-level columns.
    #    (Expected to yield 'name', 'birth_year', 'death_year' - confirm
    #    against the actual API payload.)
    .unnest("authors")
)
