💾 Archived View for gamma.lyk.so › systems › food › scripts › scraping-foodista › cache-html.sh captured on 2024-06-16 at 12:30:06.

View Raw

More Information

⬅️ Previous capture (2023-07-22)

-=-=-=-=-=-=-

#!/usr/bin/env sh
#
# Download every URL listed in a file into a local cache directory,
# mirroring each URL's host and path beneath that directory.
#
# usage: cache-html.sh <cache directory> <url list file>

# Both arguments are required. A usage error has to exit non-zero so that
# callers can detect it; a bare `exit` would return echo's status (0).
if [ -z "$2" ]; then
  echo "usage: $0 <cache directory> <url list file>" >&2
  exit 2
fi

# Exported so that the per-URL worker processes can read it from their
# environment.
export CACHE_DIR="$1"

# Scratch file that holds the generated worker script.
tmp="$(mktemp)" || exit 1

# Clean up on every exit path. The signal handlers only call `exit`, which
# in turn fires the EXIT trap; this way the script actually stops on
# INT/HUP and the file is removed exactly once. `-f` keeps the cleanup
# quiet if the file is already gone.
trap 'rm -f "$tmp"' EXIT
trap 'exit 129' HUP
trap 'exit 130' INT

# Write the per-URL worker script. The quoted "EOF" delimiter keeps the
# body literal, so every expansion below happens when the worker runs,
# not while this file is being written.
cat > "$tmp" <<"EOF"
#!/bin/sh
# Fetch one URL ($1) into $CACHE_DIR unless it is already cached.
url="$1"

# xargs may invoke the worker with no argument when its input is empty.
[ -n "$url" ] || exit 0

# Strip the leading scheme with parameter expansion. This avoids the
# non-POSIX `\?` in a basic regular expression and echo's backslash
# handling, and it needs no extra processes.
rel="${url#http://}"
rel="${rel#https://}"
path="$CACHE_DIR/$rel"

if [ -f "$path" ]; then
  echo "Already exists, skipping: $path"
else
  echo "Caching to $path"

  dir="$(dirname "$path")"
  mkdir -p "$dir"

  # Download to a sibling file and rename only on success, so a failed or
  # interrupted transfer never leaves a file that later runs would treat
  # as cached. -f makes HTTP errors count as failures; -S keeps error
  # messages visible despite -s.
  part="$path.part.$$"
  if curl -fsS -o "$part" "$url"; then
    mv -f "$part" "$path"
  else
    rm -f "$part"
    echo "Failed to fetch: $url" >&2
  fi

  # rate limit, don't be *too* obnoxious
  sleep 1
fi
EOF

# Run up to 10 workers in parallel, one URL per invocation. The worker is
# passed to `sh` as a script file, so it needs no execute bit and still
# works when the temp directory is mounted noexec. The URL list is fed by
# redirection instead of an extra `cat` process.
xargs -P 10 -n 1 sh "$tmp" < "$2"