get-pg.sh 792 B

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. #!/bin/bash
  #######################################
  # Print a one-line usage summary and abort the script.
  # Arguments: none
  # Outputs:   writes the usage line to stderr
  # Returns:   never returns; exits the shell with status 1
  #######################################
  usage() {
    # Script name first, then the placeholder for the required argument.
    echo "usage: $0 <n>" >&2
    exit 1
  }
  #######################################
  # Abort the script unless the named command can be invoked.
  # Arguments: $1 - name of the command to look up
  # Outputs:   writes an error message to stderr when the lookup fails
  # Returns:   0 when the command is found; otherwise exits with status 1
  #######################################
  has_cmd() {
    # Rely on the exit status of `command -v` instead of testing its output
    # with -x: for builtins and functions it prints a bare name, not a path,
    # so a file test would wrongly report them as missing.
    if ! command -v "$1" >/dev/null 2>&1; then
      echo "error: $1 is not available" >&2
      exit 1
    fi
  }
  # Download the first <n> essays listed in the RSS feed, convert each page to
  # plain text wrapped at 80 columns, and concatenate the results in pg.txt.

  # Verify every external tool used below before doing any work.
  has_cmd curl
  has_cmd html2text
  has_cmd grep
  has_cmd head
  has_cmd tail
  has_cmd sed
  has_cmd fmt

  if [ $# -ne 1 ]; then
    usage
  fi
  n=$1
  # <n> is handed to `head -n`, so it must be a positive decimal integer.
  case "$n" in
    '' | *[!0-9]*) usage ;;
  esac
  if [ "$n" -eq 0 ]; then
    usage
  fi

  feed_url="http://www.aaronsw.com/2002/feeds/pgessays.rss"
  out_file="pg.txt"

  # get urls
  # -f turns HTTP errors into a non-zero exit status so that a failed download
  # is detected here instead of silently producing an empty url list.
  if ! feed="$(curl -fsSL "$feed_url")"; then
    echo "error: could not fetch $feed_url" >&2
    exit 1
  fi
  # Keep only lines mentioning "html" and trim each one down to the url itself.
  urls="$(printf '%s\n' "$feed" \
    | grep html \
    | sed -e 's/.*http/http/' -e 's/html.*/html/' \
    | head -n "$n")"
  if [ -z "$urls" ]; then
    echo "error: no essay urls found in $feed_url" >&2
    exit 1
  fi
  printf "urls:\n%s\n" "$urls"

  # Start from a clean output file; each essay is appended below.
  rm -f -- "$out_file"

  # Read the list line by line on fd 3 so that nothing inside the loop can
  # consume it through stdin, and so that urls are never word-split or globbed.
  while IFS= read -r url <&3; do
    [ -n "$url" ] || continue
    echo "processing $url"
    # Drop the first three lines of the converted page, strip leading
    # whitespace, and re-wrap the text to 80 columns.
    curl -fsSL "$url" \
      | html2text \
      | tail -n +4 \
      | sed -E 's/^[[:space:]]+//' \
      | fmt -w 80 >> "$out_file"
    # PIPESTATUS[0] is curl's status; with -f a failed request emits no body,
    # so nothing bogus is appended for this url.
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
      echo "warning: failed to fetch $url, skipping" >&2
    fi
    # don't flood the server
    sleep 1
  done 3<<< "$urls"

  echo "done. data in $out_file"
  exit 0