ソースを参照

scripts : improve get-pg.sh (#4838)

Georgi Gerganov 2 年前
コミット
9a818f7c42
1 ファイル変更、24 行追加、1 行削除
  1. 24 1
      scripts/get-pg.sh

+ 24 - 1
scripts/get-pg.sh

@@ -2,6 +2,22 @@
 
 function usage {
     echo "usage: <n>$0"
+    echo "note: n is the number of essays to download"
+    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
+    echo "n   | tokens"
+    echo "--- | ---"
+    echo "1   | 6230"
+    echo "2   | 23619"
+    echo "5   | 25859"
+    echo "10  | 36888"
+    echo "15  | 50188"
+    echo "20  | 59094"
+    echo "25  | 88764"
+    echo "30  | 103121"
+    echo "32  | 108338"
+    echo "35  | 113403"
+    echo "40  | 127699"
+    echo "45  | 135896"
     exit 1
 }
 
@@ -33,10 +49,17 @@ if [ -f pg.txt ]; then
     rm pg.txt
 fi
 
+c=1
 for url in $urls; do
     echo "processing $url"
 
-    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
+    cc=$(printf "%03d" $c)
+
+    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
+    cat pg-$cc-one.txt >> pg.txt
+
+    cp -v pg.txt pg-$cc-all.txt
+    c=$((c+1))
 
     # don't flood the server
     sleep 1