Data Scraping for Technical Artists
Reference gathering is one of the most time-consuming parts of any art pipeline. In this project you'll build a suite of Python tools that automatically collect reference images, scrape 3D asset metadata from the web, query REST APIs, and organise everything into a searchable catalogue. The result is a reusable toolkit that saves hours of manual browsing - and a great portfolio piece.
Why Data Scraping Is Useful for TAs
Technical artists frequently need to collect and organise large amounts of data:
- Reference libraries - downloading hundreds of reference photos for a specific theme (architecture, vegetation, costumes).
- Asset databases - cataloguing metadata (poly counts, texture resolutions, tags) from 3D asset stores.
- Pipeline auditing - scraping internal wiki pages or Shotgun/Ftrack for shot status dashboards.
- Market research - gathering data on trending asset categories, pricing, or tech art job postings.
Always check a site's robots.txt and terms of service before scraping. The techniques in this guide are for educational purposes and personal reference gathering. Respect rate limits and never redistribute scraped content without permission.
Web Scraping Basics with Requests & BeautifulSoup
The requests library handles HTTP, and BeautifulSoup parses the returned HTML into a traversable tree. Together they're the backbone of most Python scraping workflows.
"""scraper_basics.py - Fetch a page and extract structured data."""
import requests
from bs4 import BeautifulSoup
def fetch_page(url: str, timeout: int = 10) -> BeautifulSoup:
    """Fetch *url* over HTTP and parse the body into a BeautifulSoup tree.

    Raises requests.HTTPError (via raise_for_status) on non-2xx responses.
    """
    # A browser-like User-Agent avoids the naive bot blocking some sites apply.
    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    resp = requests.get(url, headers={"User-Agent": browser_ua}, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
def extract_image_urls(soup: BeautifulSoup, base_url: str = "") -> list[str]:
    """Pull all image URLs from a parsed page.

    Resolves protocol-relative ("//cdn..."), root-relative ("/img/x.jpg")
    and path-relative ("img/x.jpg") sources against *base_url*. The
    original code appended path-relative URLs unresolved, producing
    un-downloadable entries; urljoin handles every case uniformly.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative references

    urls: list[str] = []
    for img in soup.find_all("img"):
        # Lazy-loaded images often keep the real URL in data-src.
        src = img.get("src") or img.get("data-src") or ""
        if not src:
            continue
        if src.startswith("//"):
            # Protocol-relative URL: assume HTTPS.
            src = "https:" + src
        elif base_url:
            # Absolute URLs pass through urljoin unchanged; relative
            # ones are resolved against the site root.
            src = urljoin(base_url.rstrip("/") + "/", src)
        urls.append(src)
    return urls
if __name__ == "__main__":
    gallery = "https://example.com/gallery"
    page = fetch_page(gallery)
    images = extract_image_urls(page, base_url="https://example.com")
    print(f"Found {len(images)} images:")
    # Show only a sample rather than flooding the terminal.
    for img_url in images[:10]:
        print(f" {img_url}")
Building a Reference Image Downloader
This script takes a list of image URLs, downloads them concurrently, and saves them into a named folder. It includes retry logic, progress reporting, and skips files that already exist.
"""image_downloader.py - Download images with retries and progress tracking."""
import requests
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse
def download_image(url: str, output_dir: Path, retries: int = 3) -> dict:
    """Download a single image into *output_dir* with retry/backoff.

    Returns a result dict with keys "url", "status" ("ok" | "skipped" |
    "failed"), and either "path" or "error". Files already on disk are
    never re-downloaded.
    """
    # Derive a filename from the URL path; fall back to a generic name.
    filename = Path(urlparse(url).path).name or "image.jpg"
    filepath = output_dir / filename
    if filepath.exists():
        return {"url": url, "status": "skipped", "path": str(filepath)}
    for attempt in range(1, retries + 1):
        try:
            # stream=True + iter_content writes the body in chunks instead
            # of buffering the whole image in memory (the original called
            # .content, defeating the stream); the with-block guarantees
            # the connection is released.
            with requests.get(url, timeout=15, stream=True) as resp:
                resp.raise_for_status()
                with open(filepath, "wb") as fh:
                    for chunk in resp.iter_content(chunk_size=65536):
                        fh.write(chunk)
            return {"url": url, "status": "ok", "path": str(filepath)}
        except requests.RequestException as exc:
            # Remove any partial file so a retry (or a later run's
            # exists() check) doesn't mistake it for a good download.
            filepath.unlink(missing_ok=True)
            if attempt == retries:
                return {"url": url, "status": "failed", "error": str(exc)}
            time.sleep(1.0 * attempt)  # linear backoff
    return {"url": url, "status": "failed", "error": "max retries"}
def bulk_download(
    urls: list[str],
    output_dir: str = "references",
    max_workers: int = 4,
    delay: float = 0.25,
) -> list[dict]:
    """Download every URL in *urls* into *output_dir* via a thread pool."""
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)
    results: list[dict] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending: dict = {}
        for link in urls:
            pending[pool.submit(download_image, link, target)] = link
            time.sleep(delay)  # stagger submissions to stay polite
        # Report each download as soon as it finishes, in completion order.
        for done in as_completed(pending):
            outcome = done.result()
            results.append(outcome)
            status = outcome["status"]
            print(f"[{status.upper():>7}] {pending[done][:80]}")
    ok = sum(1 for r in results if r["status"] == "ok")
    print(f"\nDownloaded {ok}/{len(urls)} images to {output_dir}/")
    return results
if __name__ == "__main__":
    from scraper_basics import fetch_page, extract_image_urls

    gallery_url = "https://example.com/gallery"
    page = fetch_page(gallery_url)
    refs = extract_image_urls(page, base_url="https://example.com")
    bulk_download(refs, output_dir="references/gallery")
Keep max_workers low (2-4) and always include a delay between requests. Being a polite scraper keeps you from getting blocked and prevents unnecessary load on other people's servers.
Scraping 3D Asset Metadata
Many 3D asset stores display metadata - polygon count, texture resolution, file format, tags - directly in the page HTML. The scraper below extracts this data into structured dictionaries you can later store in a database or CSV.
"""asset_scraper.py - Extract 3D asset metadata from listing pages."""
import re
from dataclasses import asdict, dataclass, field

from scraper_basics import fetch_page
@dataclass
class AssetInfo:
    """Structured metadata for a single scraped 3D asset."""

    name: str
    url: str
    poly_count: int = 0
    texture_res: str = ""
    # field(default_factory=list) gives each instance its own list; the
    # original `= None` default also mis-declared the type (list[str] = None).
    file_formats: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    price: str = ""

    def __post_init__(self):
        # Backward compatibility: callers that explicitly pass None for
        # either list still get an empty list.
        self.file_formats = self.file_formats or []
        self.tags = self.tags or []
def parse_poly_count(text: str) -> int:
    """Convert strings like '12.5k' or '1,200' to integers.

    Returns 0 for anything that does not start with a number.
    """
    cleaned = text.strip().lower().replace(",", "")
    # Capture the numeric part plus an *adjacent* 'k' suffix only. The
    # original checked `"k" in text`, so "500 bricks" became 500000; its
    # `[\d.]+` pattern also matched "." alone and crashed in float().
    match = re.match(r"(\d+(?:\.\d+)?)\s*(k?)", cleaned)
    if not match:
        return 0
    value = float(match.group(1))
    if match.group(2):
        value *= 1000  # '12.5k' -> 12500
    return int(value)
def scrape_asset_listing(url: str) -> list[AssetInfo]:
    """Scrape a listing page for asset cards and return AssetInfo records."""
    soup = fetch_page(url)
    collected: list[AssetInfo] = []
    # Adapt these selectors to your target site
    for card in soup.select(".asset-card"):
        title = card.select_one(".asset-card__title")
        link = card.select_one("a[href]")
        polys = card.select_one(".asset-card__polys")
        texture = card.select_one(".asset-card__texture")
        price = card.select_one(".asset-card__price")
        collected.append(AssetInfo(
            name=title.get_text(strip=True) if title else "Unknown",
            url=link["href"] if link else "",
            poly_count=parse_poly_count(polys.get_text()) if polys else 0,
            texture_res=texture.get_text(strip=True) if texture else "",
            file_formats=[el.get_text(strip=True)
                          for el in card.select(".asset-card__format")],
            tags=[el.get_text(strip=True)
                  for el in card.select(".asset-card__tag")],
            price=price.get_text(strip=True) if price else "Free",
        ))
    return collected
if __name__ == "__main__":
    listing_url = "https://example.com/3d-assets?category=weapons"
    scraped = scrape_asset_listing(listing_url)
    for a in scraped:
        print(f"{a.name} - {a.poly_count} polys - {a.file_formats} - {a.price}")
    print(f"\nScraped {len(scraped)} assets.")
Working with APIs for Asset Data
Many services offer a proper REST API, which is far more reliable than HTML scraping. The pattern below shows how to authenticate, paginate, and collect results from a JSON API.
"""api_client.py - Query a REST API for asset data with pagination."""
import requests
import time
from typing import Generator
class AssetAPIClient:
    """Generic client for a paginated asset REST API."""

    def __init__(self, base_url: str, api_key: str = ""):
        self.base_url = base_url.rstrip("/")
        self.session = requests.Session()
        # Session headers apply to every request made through this client.
        if api_key:
            self.session.headers["Authorization"] = f"Bearer {api_key}"
        self.session.headers["Accept"] = "application/json"

    def search(
        self,
        query: str,
        category: str = "",
        per_page: int = 50,
        max_pages: int = 10,
        delay: float = 1.0,
    ) -> Generator[dict, None, None]:
        """Yield asset records across paginated results."""
        query_params: dict = {"q": query, "per_page": per_page}
        if category:
            query_params["category"] = category
        page = 1
        while page <= max_pages:
            query_params["page"] = page
            response = self.session.get(
                f"{self.base_url}/assets/search",
                params=query_params,
                timeout=15,
            )
            response.raise_for_status()
            payload = response.json()
            records = payload.get("results", [])
            if not records:
                break
            # Normalize the API's camelCase fields into snake_case dicts.
            for record in records:
                yield {
                    "id": record.get("id"),
                    "name": record.get("name", ""),
                    "poly_count": record.get("polyCount", 0),
                    "formats": record.get("formats", []),
                    "thumbnail": record.get("thumbnailUrl", ""),
                    "license": record.get("license", "unknown"),
                }
            # Stop once the API reports no further pages.
            if page >= payload.get("total_pages", 1):
                break
            time.sleep(delay)
            page += 1
if __name__ == "__main__":
    api = AssetAPIClient(
        base_url="https://api.example.com/v1",
        api_key="your_api_key_here",
    )
    print("Searching for 'medieval weapon' assets...\n")
    for asset in api.search("medieval weapon", category="props", max_pages=3):
        print(f" {asset['name']} - {asset['poly_count']} polys - {asset['formats']}")
Popular APIs with free tiers for 3D assets include Sketchfab, Poly Haven, and Smithsonian 3D. Each has slightly different authentication and pagination schemes - adapt the client class above to match their docs.
Organizing and Cataloguing Scraped Data
Raw scraped data is only useful if you can search and filter it later. The catalogue class below stores assets in a local SQLite database so you can query them with SQL - no external database server needed.
"""catalogue.py - Store and query scraped asset data in SQLite."""
import sqlite3
import json
from pathlib import Path
from contextlib import contextmanager
DB_PATH = Path("asset_catalogue.db")
@contextmanager
def get_db(db_path: Path = DB_PATH):
    """Context manager yielding a sqlite3 connection.

    Commits when the body completes cleanly, rolls back if it raises
    (the original simply closed the connection, leaving any open
    transaction to be rolled back implicitly and masking intent), and
    always closes the connection.
    """
    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row  # rows become dict-like, name-addressable
    try:
        yield conn
    except Exception:
        conn.rollback()  # don't leave a half-applied transaction behind
        raise
    else:
        conn.commit()
    finally:
        conn.close()
def init_database(db_path: Path = DB_PATH):
    """Create the assets table, FTS index, and sync triggers if missing."""
    with get_db(db_path) as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS assets (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                source_url TEXT,
                poly_count INTEGER DEFAULT 0,
                texture_res TEXT DEFAULT '',
                formats TEXT DEFAULT '[]',
                tags TEXT DEFAULT '[]',
                license TEXT DEFAULT 'unknown',
                thumbnail TEXT DEFAULT '',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS assets_fts
            USING fts5(name, tags, content=assets, content_rowid=rowid)
        """)
        # BUG FIX: an external-content FTS5 table ("content=assets") is NOT
        # populated automatically. Without the triggers below, inserts into
        # `assets` never reach the index and every MATCH query returns zero
        # rows. The 'delete' command form is the documented way to remove
        # stale entries from an external-content index.
        conn.execute("""
            CREATE TRIGGER IF NOT EXISTS assets_ai AFTER INSERT ON assets BEGIN
                INSERT INTO assets_fts(rowid, name, tags)
                VALUES (new.rowid, new.name, new.tags);
            END
        """)
        conn.execute("""
            CREATE TRIGGER IF NOT EXISTS assets_ad AFTER DELETE ON assets BEGIN
                INSERT INTO assets_fts(assets_fts, rowid, name, tags)
                VALUES ('delete', old.rowid, old.name, old.tags);
            END
        """)
        conn.execute("""
            CREATE TRIGGER IF NOT EXISTS assets_au AFTER UPDATE ON assets BEGIN
                INSERT INTO assets_fts(assets_fts, rowid, name, tags)
                VALUES ('delete', old.rowid, old.name, old.tags);
                INSERT INTO assets_fts(rowid, name, tags)
                VALUES (new.rowid, new.name, new.tags);
            END
        """)
def upsert_asset(asset: dict, db_path: Path = DB_PATH):
    """Insert or update a single asset record.

    Falls back to the asset name as the primary key when the source has
    no stable id. List-valued fields are stored as JSON strings.
    """
    with get_db(db_path) as conn:
        # BUG FIX: the original conflict clause only refreshed name,
        # poly_count, formats and tags, silently discarding updates to
        # source_url, texture_res, license and thumbnail on re-scrape.
        conn.execute("""
            INSERT INTO assets (id, name, source_url, poly_count, texture_res,
                                formats, tags, license, thumbnail)
            VALUES (:id, :name, :url, :poly_count, :texture_res,
                    :formats, :tags, :license, :thumbnail)
            ON CONFLICT(id) DO UPDATE SET
                name=excluded.name, source_url=excluded.source_url,
                poly_count=excluded.poly_count, texture_res=excluded.texture_res,
                formats=excluded.formats, tags=excluded.tags,
                license=excluded.license, thumbnail=excluded.thumbnail
        """, {
            "id": asset.get("id", asset["name"]),
            "name": asset["name"],
            "url": asset.get("url", ""),
            "poly_count": asset.get("poly_count", 0),
            "texture_res": asset.get("texture_res", ""),
            "formats": json.dumps(asset.get("formats", [])),
            "tags": json.dumps(asset.get("tags", [])),
            "license": asset.get("license", "unknown"),
            "thumbnail": asset.get("thumbnail", ""),
        })
def search_catalogue(query: str, db_path: Path = DB_PATH) -> list[dict]:
    """Full-text search the catalogue."""
    # Join the FTS index back to the main table by rowid; `rank` orders
    # results by FTS5 relevance.
    sql = """
        SELECT a.* FROM assets a
        JOIN assets_fts fts ON a.rowid = fts.rowid
        WHERE assets_fts MATCH ?
        ORDER BY rank
        LIMIT 50
    """
    with get_db(db_path) as conn:
        matches = conn.execute(sql, (query,)).fetchall()
    return [dict(match) for match in matches]
def filter_by_poly_count(max_polys: int, db_path: Path = DB_PATH) -> list[dict]:
    """Find assets under a polygon budget."""
    with get_db(db_path) as conn:
        cursor = conn.execute(
            "SELECT * FROM assets WHERE poly_count <= ? ORDER BY poly_count DESC",
            (max_polys,),
        )
        return [dict(record) for record in cursor.fetchall()]
if __name__ == "__main__":
    init_database()

    # Insert sample data
    sample_assets = [
        {"name": "Medieval Sword", "poly_count": 3200, "formats": ["fbx", "obj"],
         "tags": ["weapon", "medieval", "fantasy"], "license": "CC-BY"},
        {"name": "Sci-Fi Crate", "poly_count": 800, "formats": ["fbx", "gltf"],
         "tags": ["prop", "sci-fi", "container"], "license": "CC0"},
        {"name": "Stone Wall Module", "poly_count": 450, "formats": ["fbx"],
         "tags": ["environment", "medieval", "modular"], "license": "CC-BY"},
    ]
    for record in sample_assets:
        upsert_asset(record)
    print(f"Inserted {len(sample_assets)} assets.\n")

    # Search
    hits = search_catalogue("medieval")
    print(f"Search 'medieval': {len(hits)} results")
    for hit in hits:
        print(f" {hit['name']} - {hit['poly_count']} polys")

    # Filter
    budget_hits = filter_by_poly_count(1000)
    print(f"\nAssets under 1000 polys: {len(budget_hits)}")
    for hit in budget_hits:
        print(f" {hit['name']} - {hit['poly_count']} polys")
Ethical Scraping Practices & Rate Limiting
Scraping responsibly is not optional - it's a professional requirement. Follow these guidelines:
Violating a website's terms of service can result in IP bans, legal action, or damage to your professional reputation. When in doubt, use an official API or ask permission.
- Check robots.txt - visit https://site.com/robots.txt and respect any Disallow directives.
- Rate limit your requests - never send more than one request per second to the same domain. Use time.sleep() between calls.
- Identify yourself - set a descriptive User-Agent header so site admins know who's accessing their content.
- Cache aggressively - store pages locally after the first fetch so repeated runs don't re-download.
- Don't redistribute - scraped data is for personal/internal use. Never republish someone else's content.
- Prefer APIs - if a site offers an API, always use it instead of scraping HTML. APIs are more stable and explicitly sanctioned.
"""rate_limiter.py - Simple rate limiter and cache for polite scraping."""
import time
import hashlib
import json
from pathlib import Path
from functools import wraps
class RateLimiter:
    """Enforce a minimum delay between calls to the same domain."""

    def __init__(self, min_delay: float = 1.0):
        self.min_delay = min_delay
        # Per-domain timestamp of the most recent call (monotonic clock,
        # so wall-clock adjustments can't shrink or stretch the delay).
        self._last_call: dict[str, float] = {}

    def wait(self, domain: str):
        """Sleep just long enough to honour min_delay for *domain*."""
        since_last = time.monotonic() - self._last_call.get(domain, 0.0)
        remaining = self.min_delay - since_last
        if remaining > 0:
            time.sleep(remaining)
        self._last_call[domain] = time.monotonic()
class PageCache:
"""Cache fetched pages to disk so re-runs are instant."""
def __init__(self, cache_dir: str = ".scrape_cache"):
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(exist_ok=True)
def _key(self, url: str) -> str:
return hashlib.sha256(url.encode()).hexdigest()
def get(self, url: str) -> str | None:
path = self.cache_dir / self._key(url)
if path.exists():
return path.read_text(encoding="utf-8")
return None
def put(self, url: str, content: str):
path = self.cache_dir / self._key(url)
path.write_text(content, encoding="utf-8")
# Usage with the scraper basics
if __name__ == "__main__":
    import requests

    limiter = RateLimiter(min_delay=1.5)
    cache = PageCache()
    pages = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    ]
    for url in pages:
        cached_body = cache.get(url)
        if cached_body:
            print(f"[CACHE HIT] {url}")
            html = cached_body
        else:
            # Throttle per-domain, then fetch and store for next run.
            limiter.wait("example.com")
            print(f"[FETCHING] {url}")
            html = requests.get(url, timeout=10).text
            cache.put(url, html)
        print(f" Page length: {len(html)} characters")
For your portfolio, highlight the ethical considerations you built into the tool - rate limiting, caching, and robots.txt compliance. Studios value engineers who think about these things.