wget下载页面脚本
首先配置文件:
[wusx@centos245 gather]$ cat config
start=359822
end=550000
threads=15
output=/home/wusx/gather/output
log=/home/wusx/gather/_wget.log
然后运行脚本:
[wusx@centos245 gather]$ cat wget.sh
#!/bin/bash
# Download a numbered range of pages with wget, several at a time.
#
# Settings are read from the config file as key=value lines:
#   start   - first page number to fetch
#   end     - page number at which to stop (exclusive)
#   threads - maximum number of wget processes running at once
#   output  - directory that receives the downloaded pages
#   log     - file to which every requested URL is appended
#
# After each download is launched, the start= line of the config file is
# rewritten with the next page number, so an interrupted run resumes where
# it stopped.

filename=/home/wusx/gather/config

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Number of background downloads started by this shell that are still running.
# Uses the shell's own job table rather than grepping `ps` output, which would
# also match unrelated processes whose line merely contains the PID digits.
running_downloads() {
  jobs -rp | wc -l
}

[[ -r "$filename" ]] || die "cannot read config file: $filename"

# Parse the config explicitly instead of eval'ing it, so that a stray line in
# the file can never be executed as shell code. Unknown keys are ignored.
start='' end='' threads='' output='' log=''
while IFS='=' read -r key value || [[ -n "$key" ]]; do
  case "$key" in
    start)   start=$value ;;
    end)     end=$value ;;
    threads) threads=$value ;;
    output)  output=$value ;;
    log)     log=$value ;;
    *) ;;
  esac
done < "$filename"

for name in start end threads; do
  [[ "${!name}" =~ ^[0-9]+$ ]] \
    || die "config: '$name' must be a non-negative integer"
done
[[ -n "$output" ]] || die "config: 'output' is not set"
[[ -n "$log" ]] || die "config: 'log' is not set"

# Force base 10 so values with leading zeros are not read as octal.
i=$((10#$start))
max=$((10#$end))
Threads=$((10#$threads))
# With a limit of 0 the throttle loop below could never exit.
(( Threads >= 1 )) || die "config: 'threads' must be at least 1"

mkdir -p -- "$output" || die "cannot create output directory: $output"

# 'end' is exclusive; an empty range (start >= end) downloads nothing.
while (( i < max )); do
  url="http://www.abc.com/$i/cc.html"
  out="$output/$i.info"

  wget -q --user-agent='Baiduspider' -O "$out" "$url" > /dev/null &
  i=$((i + 1))

  # Write the next page number back to the config file so a restart resumes
  # from here.
  sed -i "s/^start=.*/start=$i/" "$filename" \
    || printf 'warning: could not update start= in %s\n' "$filename" >&2
  printf '%s\n' "$url" >> "$log"

  # Throttle: hold off launching more downloads while the limit is reached.
  while (( $(running_downloads) >= Threads )); do
    sleep 2
  done
done

# Let the last batch of downloads finish before the script exits.
wait