Data Scraping for Technical Artists

Reference gathering is one of the most time-consuming parts of any art pipeline. In this project you'll build a suite of Python tools that automatically collect reference images, scrape 3D asset metadata from the web, query REST APIs, and organise everything into a searchable catalogue. The result is a reusable toolkit that saves hours of manual browsing - and a great portfolio piece.

Why Data Scraping Is Useful for TAs

Technical artists frequently need to collect and organise large amounts of data:

  • Reference libraries - downloading hundreds of reference photos for a specific theme (architecture, vegetation, costumes).
  • Asset databases - cataloguing metadata (poly counts, texture resolutions, tags) from 3D asset stores.
  • Pipeline auditing - scraping internal wiki pages or Shotgun/Ftrack for shot status dashboards.
  • Market research - gathering data on trending asset categories, pricing, or tech art job postings.
Note

Always check a site's robots.txt and terms of service before scraping. The techniques in this guide are for educational purposes and personal reference gathering. Respect rate limits and never redistribute scraped content without permission.

Web Scraping Basics with Requests & BeautifulSoup

The requests library handles HTTP, and BeautifulSoup parses the returned HTML into a traversable tree. Together they're the backbone of most Python scraping workflows.

Python
"""scraper_basics.py - Fetch a page and extract structured data."""
import requests
from bs4 import BeautifulSoup

def fetch_page(url: str, timeout: int = 10) -> BeautifulSoup:
    """Download a page and return a BeautifulSoup object.

    Args:
        url: Page to fetch.
        timeout: Seconds before the request is aborted.

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
    """
    # A browser-like User-Agent avoids trivial bot blocking.
    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    response = requests.get(
        url, headers={"User-Agent": browser_ua}, timeout=timeout
    )
    response.raise_for_status()  # surface HTTP errors as exceptions
    return BeautifulSoup(response.text, "html.parser")

def extract_image_urls(soup: "BeautifulSoup", base_url: str = "") -> list[str]:
    """Pull all image URLs from a parsed page.

    Reads both ``src`` and the lazy-loading ``data-src`` attribute. When
    *base_url* is given, protocol-relative (``//cdn/x.png``), root-relative
    (``/x.png``) and path-relative (``img/x.png``) URLs are all resolved
    against it via ``urljoin`` (the previous version left path-relative
    URLs unresolved and therefore unusable).

    Args:
        soup: Parsed page (any object with a BeautifulSoup-style
            ``find_all``/``get`` API).
        base_url: Site origin, e.g. ``"https://example.com"``. If empty,
            only protocol-relative URLs are normalised (``https:`` added).

    Returns:
        Image URL strings in document order (duplicates are kept).
    """
    from urllib.parse import urljoin

    urls: list[str] = []
    for img in soup.find_all("img"):
        # Lazy-loaded images often carry the real URL in data-src.
        src = img.get("src") or img.get("data-src") or ""
        if not src:
            continue
        if base_url:
            # urljoin resolves //host, /path and relative paths alike,
            # and leaves already-absolute URLs untouched.
            src = urljoin(base_url.rstrip("/") + "/", src)
        elif src.startswith("//"):
            # No base to resolve against: assume https for scheme-less URLs.
            src = "https:" + src
        urls.append(src)
    return urls

if __name__ == "__main__":
    # Demo: print the first few image URLs found on a gallery page.
    gallery_url = "https://example.com/gallery"
    page = fetch_page(gallery_url)

    found = extract_image_urls(page, base_url="https://example.com")
    print(f"Found {len(found)} images:")
    for image_url in found[:10]:
        print(f"  {image_url}")

Building a Reference Image Downloader

This script takes a list of image URLs, downloads them concurrently, and saves them into a named folder. It includes retry logic, progress reporting, and skips files that already exist.

Python
"""image_downloader.py - Download images with retries and progress tracking."""
import requests
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

def download_image(url: str, output_dir: Path, retries: int = 3) -> dict:
    """Download a single image with retry logic.

    Args:
        url: Direct URL of the image.
        output_dir: Existing directory to save into.
        retries: Total attempts before giving up.

    Returns:
        Dict with ``url``, ``status`` ("ok" / "skipped" / "failed") and
        either ``path`` or ``error``.
    """
    filename = Path(urlparse(url).path).name or "image.jpg"
    filepath = output_dir / filename

    if filepath.exists():
        return {"url": url, "status": "skipped", "path": str(filepath)}

    for attempt in range(1, retries + 1):
        try:
            # Stream the body in chunks instead of buffering it all in
            # memory (stream=True combined with resp.content defeats
            # streaming). The with-block also releases the connection.
            with requests.get(url, timeout=15, stream=True) as resp:
                resp.raise_for_status()
                # Write to a .part file first so an interrupted download
                # never leaves a truncated file that later runs would
                # wrongly "skip".
                tmp_path = filepath.with_suffix(filepath.suffix + ".part")
                with open(tmp_path, "wb") as fh:
                    for chunk in resp.iter_content(chunk_size=64 * 1024):
                        if chunk:
                            fh.write(chunk)
                tmp_path.replace(filepath)
            return {"url": url, "status": "ok", "path": str(filepath)}
        except (requests.RequestException, OSError) as exc:
            # OSError covers disk-side failures (permissions, disk full)
            # so a single bad file doesn't crash the whole worker pool.
            if attempt == retries:
                return {"url": url, "status": "failed", "error": str(exc)}
            time.sleep(1.0 * attempt)  # linear backoff between attempts

    return {"url": url, "status": "failed", "error": "max retries"}

def bulk_download(
    urls: list[str],
    output_dir: str = "references",
    max_workers: int = 4,
    delay: float = 0.25,
) -> list[dict]:
    """Download multiple images concurrently.

    Submissions are staggered by *delay* seconds so the pool never fires a
    burst of simultaneous first requests at the server.

    Args:
        urls: Image URLs to fetch.
        output_dir: Directory (created if missing) to save into.
        max_workers: Thread pool size.
        delay: Pause between task submissions, in seconds.

    Returns:
        One result dict per URL, in completion order.
    """
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    results: list[dict] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending: dict = {}
        for link in urls:
            pending[pool.submit(download_image, link, target)] = link
            time.sleep(delay)  # polite stagger between requests

        for done in as_completed(pending):
            outcome = done.result()
            results.append(outcome)
            status = outcome["status"]
            print(f"[{status.upper():>7}] {pending[done][:80]}")

    succeeded = sum(1 for r in results if r["status"] == "ok")
    print(f"\nDownloaded {succeeded}/{len(urls)} images to {output_dir}/")
    return results

if __name__ == "__main__":
    from scraper_basics import fetch_page, extract_image_urls

    # Demo: scrape a gallery page and download everything it links to.
    gallery = "https://example.com/gallery"
    page = fetch_page(gallery)
    found = extract_image_urls(page, base_url="https://example.com")

    bulk_download(found, output_dir="references/gallery")
Tip

Keep max_workers low (2-4) and always include a delay between requests. Being a polite scraper keeps you from getting blocked and prevents unnecessary load on other people's servers.

Scraping 3D Asset Metadata

Many 3D asset stores display metadata - polygon count, texture resolution, file format, tags - directly in the page HTML. The scraper below extracts this data into structured dictionaries you can later store in a database or CSV.

Python
"""asset_scraper.py - Extract 3D asset metadata from listing pages."""
import re
from dataclasses import asdict, dataclass, field

from scraper_basics import fetch_page

@dataclass
class AssetInfo:
    """Metadata for a single scraped 3D asset listing."""

    name: str
    url: str
    poly_count: int = 0
    texture_res: str = ""
    # default_factory gives every instance its own list: a shared mutable
    # default would leak state between instances, and a None placeholder
    # annotated as list[str] lies to type checkers.
    file_formats: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    price: str = ""

    def __post_init__(self):
        # Stay backward compatible with callers that explicitly pass None
        # for the list fields.
        self.file_formats = self.file_formats or []
        self.tags = self.tags or []

def parse_poly_count(text: str) -> int:
    """Convert strings like '12.5k', '1,200' or '2m tris' to an integer.

    A 'k' (thousand) or 'm' (million) multiplier is honoured only when it
    directly follows the number, so unrelated words containing those
    letters (e.g. '500 bricks') no longer inflate the count. The number
    may appear anywhere in the string ('polys: 3k' works).

    Args:
        text: Raw label text scraped from a page.

    Returns:
        The polygon count as an int, or 0 when no number is found.
    """
    cleaned = text.strip().lower().replace(",", "")
    # \d+(\.\d+)? avoids matching bare dots (which would crash float()),
    # and the suffix group only matches a multiplier adjacent to the number.
    match = re.search(r"(\d+(?:\.\d+)?)\s*([km]?)\b", cleaned)
    if not match:
        return 0
    value = float(match.group(1))
    suffix = match.group(2)
    if suffix == "k":
        value *= 1_000
    elif suffix == "m":
        value *= 1_000_000
    return int(value)

def scrape_asset_listing(url: str) -> list[AssetInfo]:
    """Scrape a listing page for asset cards and extract metadata.

    Args:
        url: Listing page URL.

    Returns:
        One AssetInfo per ``.asset-card`` element found on the page.
    """
    soup = fetch_page(url)

    def text_of(element, fallback: str = "") -> str:
        # Safely read stripped text from an optional element.
        return element.get_text(strip=True) if element else fallback

    assets = []
    # Adapt these selectors to your target site
    for card in soup.select(".asset-card"):
        link = card.select_one("a[href]")
        poly = card.select_one(".asset-card__polys")
        assets.append(
            AssetInfo(
                name=text_of(card.select_one(".asset-card__title"), "Unknown"),
                url=link["href"] if link else "",
                poly_count=parse_poly_count(poly.get_text()) if poly else 0,
                texture_res=text_of(card.select_one(".asset-card__texture")),
                file_formats=[
                    text_of(f) for f in card.select(".asset-card__format")
                ],
                tags=[text_of(t) for t in card.select(".asset-card__tag")],
                price=text_of(card.select_one(".asset-card__price"), "Free"),
            )
        )

    return assets

if __name__ == "__main__":
    # Demo: dump metadata for every asset card on one listing page.
    listing_url = "https://example.com/3d-assets?category=weapons"
    scraped = scrape_asset_listing(listing_url)

    for item in scraped:
        print(f"{item.name} - {item.poly_count} polys - {item.file_formats} - {item.price}")
    print(f"\nScraped {len(scraped)} assets.")

Working with APIs for Asset Data

Many services offer a proper REST API, which is far more reliable than HTML scraping. The pattern below shows how to authenticate, paginate, and collect results from a JSON API.

Python
"""api_client.py - Query a REST API for asset data with pagination."""
import requests
import time
from typing import Generator

class AssetAPIClient:
    """Generic client for a paginated asset REST API."""

    def __init__(self, base_url: str, api_key: str = ""):
        """Create a client.

        Args:
            base_url: API root, with or without a trailing slash.
            api_key: Optional key, sent as a Bearer token on every request.
        """
        self.base_url = base_url.rstrip("/")
        self.session = requests.Session()
        self.session.headers["Accept"] = "application/json"
        if api_key:
            self.session.headers["Authorization"] = f"Bearer {api_key}"

    def search(
        self,
        query: str,
        category: str = "",
        per_page: int = 50,
        max_pages: int = 10,
        delay: float = 1.0,
    ) -> Generator[dict, None, None]:
        """Yield asset records across paginated results.

        Stops when a page comes back empty, when the API reports no
        further pages, or after *max_pages* pages. Sleeps *delay* seconds
        between page requests to stay polite.
        """
        base_params = {"q": query, "per_page": per_page}
        if category:
            base_params["category"] = category

        endpoint = f"{self.base_url}/assets/search"
        for page_number in range(1, max_pages + 1):
            resp = self.session.get(
                endpoint,
                params={**base_params, "page": page_number},
                timeout=15,
            )
            resp.raise_for_status()
            payload = resp.json()

            records = payload.get("results", [])
            if not records:
                break

            for record in records:
                # Normalise the API's camelCase fields to snake_case keys.
                yield {
                    "id": record.get("id"),
                    "name": record.get("name", ""),
                    "poly_count": record.get("polyCount", 0),
                    "formats": record.get("formats", []),
                    "thumbnail": record.get("thumbnailUrl", ""),
                    "license": record.get("license", "unknown"),
                }

            # No sleep needed once the final page has been consumed.
            if page_number >= payload.get("total_pages", 1):
                break
            time.sleep(delay)

if __name__ == "__main__":
    # Demo: page through search results for one query.
    client = AssetAPIClient(
        base_url="https://api.example.com/v1",
        api_key="your_api_key_here",
    )

    print("Searching for 'medieval weapon' assets...\n")
    results = client.search("medieval weapon", category="props", max_pages=3)
    for asset in results:
        print(f"  {asset['name']} - {asset['poly_count']} polys - {asset['formats']}")
Note

Popular APIs with free tiers for 3D assets include Sketchfab, Poly Haven, and Smithsonian 3D. Each has slightly different authentication and pagination schemes - adapt the client class above to match their docs.

Organizing and Cataloguing Scraped Data

Raw scraped data is only useful if you can search and filter it later. The catalogue class below stores assets in a local SQLite database so you can query them with SQL - no external database server needed.

Python
"""catalogue.py - Store and query scraped asset data in SQLite."""
import sqlite3
import json
from pathlib import Path
from contextlib import contextmanager

# Default catalogue location, relative to the working directory.
DB_PATH = Path("asset_catalogue.db")

@contextmanager
def get_db(db_path: Path = DB_PATH):
    """Context manager yielding a sqlite3 connection.

    Commits when the with-block exits cleanly, explicitly rolls back when
    it raises (releasing any write locks promptly instead of relying on
    close() to discard the transaction), and always closes the connection.
    """
    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row  # rows support dict-style access
    try:
        yield conn
    except BaseException:
        conn.rollback()
        raise
    else:
        conn.commit()
    finally:
        conn.close()

def init_database(db_path: Path = DB_PATH):
    """Create the assets table, its FTS5 index, and sync triggers.

    ``assets_fts`` is an *external content* FTS5 table: it indexes the
    ``assets`` table, but SQLite never populates or updates such an index
    automatically. The triggers below mirror every insert, update, and
    delete into the index — without them full-text searches silently
    match nothing.
    """
    with get_db(db_path) as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS assets (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                source_url TEXT,
                poly_count INTEGER DEFAULT 0,
                texture_res TEXT DEFAULT '',
                formats TEXT DEFAULT '[]',
                tags TEXT DEFAULT '[]',
                license TEXT DEFAULT 'unknown',
                thumbnail TEXT DEFAULT '',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS assets_fts
            USING fts5(name, tags, content=assets, content_rowid=rowid)
        """)
        # Keep the external-content index in sync with the base table
        # (the 'delete' command is FTS5's way to remove an indexed row).
        conn.executescript("""
            CREATE TRIGGER IF NOT EXISTS assets_fts_insert
            AFTER INSERT ON assets BEGIN
                INSERT INTO assets_fts(rowid, name, tags)
                VALUES (new.rowid, new.name, new.tags);
            END;
            CREATE TRIGGER IF NOT EXISTS assets_fts_delete
            AFTER DELETE ON assets BEGIN
                INSERT INTO assets_fts(assets_fts, rowid, name, tags)
                VALUES ('delete', old.rowid, old.name, old.tags);
            END;
            CREATE TRIGGER IF NOT EXISTS assets_fts_update
            AFTER UPDATE ON assets BEGIN
                INSERT INTO assets_fts(assets_fts, rowid, name, tags)
                VALUES ('delete', old.rowid, old.name, old.tags);
                INSERT INTO assets_fts(rowid, name, tags)
                VALUES (new.rowid, new.name, new.tags);
            END;
        """)

def upsert_asset(asset: dict, db_path: Path = DB_PATH):
    """Insert or update a single asset record.

    Falls back to the asset name as the primary key when "id" is missing
    *or* explicitly None — ``dict.get`` with a default only covers the
    missing-key case, and API results can carry ``"id": None``; SQLite
    TEXT primary keys admit multiple NULLs, which would create duplicate
    rows. List fields are serialised to JSON text.
    """
    # Guard against id=None so records stay deduplicated by name.
    asset_id = asset.get("id")
    if asset_id is None:
        asset_id = asset["name"]

    with get_db(db_path) as conn:
        conn.execute("""
            INSERT INTO assets (id, name, source_url, poly_count, texture_res,
                                formats, tags, license, thumbnail)
            VALUES (:id, :name, :url, :poly_count, :texture_res,
                    :formats, :tags, :license, :thumbnail)
            ON CONFLICT(id) DO UPDATE SET
                name=excluded.name, poly_count=excluded.poly_count,
                texture_res=excluded.texture_res, formats=excluded.formats,
                tags=excluded.tags, license=excluded.license,
                thumbnail=excluded.thumbnail
        """, {
            "id": asset_id,
            "name": asset["name"],
            "url": asset.get("url", ""),
            "poly_count": asset.get("poly_count", 0),
            "texture_res": asset.get("texture_res", ""),
            "formats": json.dumps(asset.get("formats", [])),
            "tags": json.dumps(asset.get("tags", [])),
            "license": asset.get("license", "unknown"),
            "thumbnail": asset.get("thumbnail", ""),
        })

def search_catalogue(query: str, db_path: Path = DB_PATH) -> list[dict]:
    """Full-text search the catalogue; best matches first, at most 50."""
    sql = """
            SELECT a.* FROM assets a
            JOIN assets_fts fts ON a.rowid = fts.rowid
            WHERE assets_fts MATCH ?
            ORDER BY rank
            LIMIT 50
        """
    with get_db(db_path) as conn:
        matches = conn.execute(sql, (query,)).fetchall()
        return [dict(m) for m in matches]

def filter_by_poly_count(max_polys: int, db_path: Path = DB_PATH) -> list[dict]:
    """Find assets whose polygon count fits under *max_polys*."""
    sql = "SELECT * FROM assets WHERE poly_count <= ? ORDER BY poly_count DESC"
    with get_db(db_path) as conn:
        cursor = conn.execute(sql, (max_polys,))
        return [dict(record) for record in cursor.fetchall()]

if __name__ == "__main__":
    init_database()

    # Insert sample data
    sample_assets = [
        {"name": "Medieval Sword", "poly_count": 3200, "formats": ["fbx", "obj"],
         "tags": ["weapon", "medieval", "fantasy"], "license": "CC-BY"},
        {"name": "Sci-Fi Crate", "poly_count": 800, "formats": ["fbx", "gltf"],
         "tags": ["prop", "sci-fi", "container"], "license": "CC0"},
        {"name": "Stone Wall Module", "poly_count": 450, "formats": ["fbx"],
         "tags": ["environment", "medieval", "modular"], "license": "CC-BY"},
    ]

    for record in sample_assets:
        upsert_asset(record)
    print(f"Inserted {len(sample_assets)} assets.\n")

    # Full-text search demo
    matches = search_catalogue("medieval")
    print(f"Search 'medieval': {len(matches)} results")
    for hit in matches:
        print(f"  {hit['name']} - {hit['poly_count']} polys")

    # Poly-budget filter demo
    budget_friendly = filter_by_poly_count(1000)
    print(f"\nAssets under 1000 polys: {len(budget_friendly)}")
    for hit in budget_friendly:
        print(f"  {hit['name']} - {hit['poly_count']} polys")

Ethical Scraping Practices & Rate Limiting

Scraping responsibly is not optional - it's a professional requirement. Follow these guidelines:

Warning

Violating a website's terms of service can result in IP bans, legal action, or damage to your professional reputation. When in doubt, use an official API or ask permission.

  • Check robots.txt - visit https://site.com/robots.txt and respect any Disallow directives.
  • Rate limit your requests - never send more than one request per second to the same domain. Use time.sleep() between calls.
  • Identify yourself - set a descriptive User-Agent header so site admins know who's accessing their content.
  • Cache aggressively - store pages locally after the first fetch so repeated runs don't re-download.
  • Don't redistribute - scraped data is for personal/internal use. Never republish someone else's content.
  • Prefer APIs - if a site offers an API, always use it instead of scraping HTML. APIs are more stable and explicitly sanctioned.
Python
"""rate_limiter.py - Simple rate limiter and cache for polite scraping."""
import time
import hashlib
import json
from pathlib import Path
from functools import wraps

class RateLimiter:
    """Enforce a minimum delay between calls to the same domain."""

    def __init__(self, min_delay: float = 1.0):
        # Seconds that must separate two calls for any one domain.
        self.min_delay = min_delay
        # domain -> monotonic timestamp of the most recent call.
        self._last_call: dict[str, float] = {}

    def wait(self, domain: str):
        """Sleep just long enough to honour min_delay for this domain."""
        previous = self._last_call.get(domain)
        if previous is not None:
            remaining = self.min_delay - (time.monotonic() - previous)
            if remaining > 0:
                time.sleep(remaining)
        self._last_call[domain] = time.monotonic()

class PageCache:
    """Cache fetched pages to disk so re-runs are instant."""

    def __init__(self, cache_dir: str = ".scrape_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _key(self, url: str) -> str:
        return hashlib.sha256(url.encode()).hexdigest()

    def get(self, url: str) -> str | None:
        path = self.cache_dir / self._key(url)
        if path.exists():
            return path.read_text(encoding="utf-8")
        return None

    def put(self, url: str, content: str):
        path = self.cache_dir / self._key(url)
        path.write_text(content, encoding="utf-8")

# Usage with the scraper basics
if __name__ == "__main__":
    import requests

    limiter = RateLimiter(min_delay=1.5)
    cache = PageCache()

    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    ]

    for url in urls:
        cached = cache.get(url)
        if cached:
            print(f"[CACHE HIT] {url}")
            html = cached
        else:
            limiter.wait("example.com")
            print(f"[FETCHING]  {url}")
            resp = requests.get(url, timeout=10)
            html = resp.text
            cache.put(url, html)

        print(f"  Page length: {len(html)} characters")
Tip

For your portfolio, highlight the ethical considerations you built into the tool - rate limiting, caching, and robots.txt compliance. Studios value engineers who think about these things.