java网页抓取后部署web项目 一直报路径错误
开始时间:2012-07-06 09:25:00 共需查询:716条 HTTP/1.1 200 OK
java.net.MalformedURLException: unknown protocol: c
at java.net.URL.<init>(Unknown Source)
at java.net.URL.<init>(Unknown Source)
at java.net.URL.<init>(Unknown Source)
at org.cyberneko.html.HTMLScanner.setInputSource(HTMLScanner.java:860)
at org.cyberneko.html.HTMLConfiguration.setInputSource(HTMLConfiguration.java:478)
at org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:451)
at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
at org.apache.xerces.parsers.DOMParser.parse(Unknown Source)
at com.rensanning.M1Job.getHiddenValue(M1Job.java:215)
at com.rensanning.M1Job.execute(M1Job.java:78)
at org.quartz.core.JobRunShell.run(JobRunShell.java:202)
at org.quartz.simpl.SimpleThreadPool$WorkerThread.run(SimpleThreadPool.java:529)
public class BaseServelet extends HttpServlet {
public void init() throws ServletException {
super.init();
M1Job.ROOT_PATH = getServletContext().getRealPath("/");
System.out.println(M1Job.ROOT_PATH);
System.out.println(M1Job.ROOT_PATH+"html");
File file = new File(M1Job.ROOT_PATH+"html");
if (!file.exists()) {
file.mkdirs();
}
__startJob();
}
private void __startJob() {
try {
SchedulerFactory sf = new StdSchedulerFactory();
Scheduler sched = sf.getScheduler();
//每天7点钟扫描一次
String sconf1m = "0 25 9 * * ?";
M1Job m1job = new M1Job();
JobDetail job1m = new JobDetail("job1m", "group1m", m1job.getClass());
CronTrigger trigger1m = new CronTrigger("trigger1m", "group1m", "job1m", "group1m", sconf1m);
sched.addJob(job1m, true);
sched.scheduleJob(trigger1m);
sched.start();
} catch (Exception e) {
e.printStackTrace();
}
}
}
public class M1Job implements Job {
public static String ROOT_PATH = "";
private static final String HTML_TACK_HTML = "html\\tack.html";
private static final String HTML_DETAIL_HTML = "html\\detail.html";
private static String url1 = "http://wwwapps.ups.com/WebTracking/track?HTMLVersion=5.0&loc=zh_CN&Requester=UPSHome&WBPM_lid=homepage%2Fct1.html_pnl_trk&trackNums=#TRACK_NUM#&track.x=%E8%BF%BD%E8%B8%AA";
private static String url2 = "http://wwwapps.ups.com/WebTracking/detail";
public final void execute(JobExecutionContext context)
throws JobExecutionException {
System.out.println("开始时间:"+getCurrentTime());
UpsDao upsdao = new UpsDao();
TdmxDao tddao = new TdmxDao();
List<Tdmx> listtd = tddao.searchTdmx();
for(int i=0; i<listtd.size();i++){
Tdmx td = listtd.get(i);
System.out.println("共需查询:"+listtd.size()+"条");
String url1new = url1.replace("#TRACK_NUM#", td.getBl1().trim()); //URL替换的运单参数
String jbno = td.getJbno(); //工作编号
//如果upstrac表有相对记录,先执行删除
if(upsdao.DeleteUpstrace(jbno)>0)
{
upsdao.DeleteUpstrace(jbno);
}
try {
//抓取追踪信息页面HTML
getHtml(url1new, ROOT_PATH+HTML_TACK_HTML, null);
//获取 抓取运输进程页面HTML时 需要的参数
Map<String, String> data = getHiddenValue(ROOT_PATH+HTML_TACK_HTML);
if (data.get("trackNums") == null) {
continue;
}
//抓取运输进程页面HTML
getHtml(url2, ROOT_PATH+HTML_DETAIL_HTML, data);
//获取运输进程
List<DetailBean> list = getDetailList(ROOT_PATH+HTML_DETAIL_HTML);
//打印详细的运输进程
DetailBean bean = null;
System.out.println("地点" + "\t" + "日期" + "\t" + "当地时间" + "\t" + "处理");
for (int j = 0; j < list.size(); j++) {
bean = list.get(j);
System.out.println(bean.getLocation() + "\t" + bean.getDate() + "\t" + bean.getTime() + "\t" + bean.getOperation());
DetailBean db = new DetailBean();
db.setJbno(jbno);
db.setBl1(data.get("trackNums"));
db.setLocation(bean.getLocation());
String str =bean.getDate()+" "+bean.getTime();
SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm");
Date date = sdf.parse(str);
java.sql.Timestamp sqlDate = new java.sql.Timestamp(date.getTime());
db.setDate1(sqlDate);
db.setOperation(bean.getOperation());
upsdao.AddUpstrace(db);
}
String[] trackinfo = getTrackInfo(ROOT_PATH+HTML_TACK_HTML);
Tdmx uptd = new Tdmx();
String str = trackinfo[0];
String[] strs = str.split("[:]");
String strj = trackinfo[1];
String[] strjs = strj.split("[:]");
uptd.setBlagent(strs[1].replaceAll("/^ $/", ""));
uptd.setSjage(strjs[1]);
uptd.setBl1(data.get("trackNums"));
if(trackinfo[1]==null)
{
System.out.println("没有签收人,未执行更新");
}
System.out.println(strs[1].replaceAll("/^ $/", "")+"运送日期");
System.out.println(strjs[1]+"签收人");
tddao.UpdateTdmx(uptd);
System.out.println("执行更新");
System.out.println("结束时间:"+getCurrentTime());
} catch (Exception e) {
e.printStackTrace();
}
}
}
private static List<DetailBean> getDetailList(String html) throws Exception {
List<DetailBean> list = new ArrayList<DetailBean>();
DOMParser parser = new DOMParser();
parser.parse(html.replaceAll("/", "\\\\"));
Node node = parser.getDocument();
Node tb = XPathAPI.selectSingleNode(node, "//TABLE[@class='dataTable']");
if (tb == null) {
return list;
}
NodeList tdlist = XPathAPI.selectNodeList(tb, "//TR/TD");
int line = 0;
while (line < tdlist.getLength() / 4) {
DetailBean bean = new DetailBean();
bean.setLocation(deleteSpace(tdlist.item(line * 4 + 0).getTextContent()));
bean.setDate(deleteSpace(tdlist.item(line * 4 + 1).getTextContent()));
bean.setTime(deleteSpace(tdlist.item(line * 4 + 2).getTextContent()));
bean.setOperation(deleteSpace(tdlist.item(line * 4 + 3).getTextContent()));
line++;
list.add(bean);
}
return list;
}
private static String removeSpace(String in) {
char[] charArray = in.toCharArray();
for (int i = 0; i < charArray.length; i++) {
if (charArray[i] == (char)160) {
charArray[i] = (char)32;
}
}
return new String(charArray);
}
private static String[] getTrackInfo(String html) throws Exception {
DOMParser parser = new DOMParser();
parser.parse(html.replaceAll("/", "\\\\"));
Node node = parser.getDocument();
NodeList dllist = XPathAPI.selectNodeList(node, "//DL");
String[] result = new String[5];
//--运送日期
result[0] = removeSpace(deleteSpace(dllist.item(2).getTextContent()));
// result[0] = deleteSpace(dllist.item(2).getTextContent());
//--签收人
if(dllist.getLength() == 8) {
result[1] = deleteSpace(dllist.item(4).getTextContent());
} else {
result[1] = deleteSpace(dllist.item(3).getTextContent());
}
return result;
}
private static Map<String, String> getHiddenValue(String html) throws Exception {
Map<String, String> data = new HashMap<String, String>();
List<String> params = new ArrayList<String>();
params.add("loc".toLowerCase());
params.add("USER_HISTORY_LIST".toLowerCase());
params.add("progressIsLoaded".toLowerCase());
params.add("refresh_sii".toLowerCase());
params.add("showSpPkgProg1".toLowerCase());
params.add("datakey".toLowerCase());
params.add("HIDDEN_FIELD_SESSION".toLowerCase());
params.add("trackNums".toLowerCase());
DOMParser parser = new DOMParser();
parser.parse(html.replaceAll("/", "\\\\"));
Node node = parser.getDocument();
NodeList nodeList = XPathAPI.selectNodeList(node, "//INPUT");
for (int i = 0; i < nodeList.getLength(); i++) {
Element e = (Element) nodeList.item(i);
if ("hidden".equalsIgnoreCase(e.getAttribute("type"))
&& params.contains(e.getAttribute("name").toLowerCase())) {
data.put(e.getAttribute("name"), e.getAttribute("value"));
}
}
System.out.println("运单编号:" + data.get("trackNums"));
return data;
}
不知道到底哪里错了 求解决
[解决办法]
DOMParser parser = new DOMParser();
java.io.InputStream input = M1Job.class.getResourceAsStream( "/ " + html);
parser.parse(new org.xml.sax.InputSource(input));
input.close();
Node node = parser.getDocument();