
get-pg.sh

#!/bin/bash
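
# Download the first n essays from Paul Graham's RSS feed
# (http://www.aaronsw.com/2002/feeds/pgessays.rss) and concatenate
# them into pg.txt, e.g.: ./get-pg.sh 10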

function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n | tokens"
    echo "--- | ---"
    echo "1 | 6230"
    echo "2 | 23619"
    echo "5 | 25859"
    echo "10 | 36888"
    echo "15 | 50188"
    echo "20 | 59094"
    echo "25 | 88764"
    echo "30 | 103121"
    echo "32 | 108338"
    echo "35 | 113403"
    echo "40 | 127699"
    echo "45 | 135896"
    exit 1
}

function has_cmd {
    if ! [ -x "$(command -v $1)" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# get urls
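# (grep the feed for lines containing "html", sed away the RSS markup
# around each URL, and keep the first n matches)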
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n $n)"

printf "urls:\n%s\n" "$urls"

# remove any previous output so pg.txt starts fresh
if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" $c)
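
    # fetch the essay, convert the HTML to text, drop the first three
    # lines of page header, strip leading whitespace, and re-wrap at 80 columns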
    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
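    # pg-$cc-one.txt holds this essay alone; pg.txt accumulates all essays,
    # and pg-$cc-all.txt snapshots the accumulated text after each step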
    cat pg-$cc-one.txt >> pg.txt
    cp -v pg.txt pg-$cc-all.txt

    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0