求救啊..正则读网页问题,高手们请帮帮忙啊
我要读取网页源代码中的内容并写入数据库,下面是我想要读取的其中一段:
<table class= "os " width= "100% ">
<tr>
<th align= "left " colspan=2>
(总称)WA est 2007 ( <a style= "color: #FFFFFF " href= "/osOview/Foll/spId/1/sitel/0 "> Foll </a> , Asiall)
<br/>
<div class= "smallText " style= "font-weight: normal; color: #FFFFFF "> Highest (average) os and prties listed for selected bes </div>
</th>
</tr>
<tr bgcolor= "#FFFFFF ">
<td valign= "top " width= "15% ">
<a href= "/osOview/产品1-产品2/ev/97305898/site/0 ">
产品1- <br> 产品2 </a> <br/>
上市时间 24/06/07 17:30 </td>
<td style= "padding-bottom:4px; ">
<table class= "osInc " style= "margin-bottom:10px; ">
<tr>
<td width=150> <a href= "/osDetail/Foll-Asiall-WA_est_2007/evId/97305898/bTypeId/43/scId/2/sitel/0 "> 价格 </a> : <br> (2) </td>
<td width=100> Home (27%) <br> 价格1 </td> <td width=15> </td>
<td width=100> Draw (29%) <br> 价格2 </td> <td width=15> </td>
<td width=100> Away (44%) <br> 价格3 </td> <td width=15> </td>
</tr>
<tr>
<td width=150> <a href= "/osDetail/Foll-Asiall-WA_est_2007/evId/97305898/bTypeId/48/scId/2/sitel/0 "> 颜色 </a> : <br> (4) </td>
<td width=100> Home 0 <br> 颜色1 </td> <td width=15> </td>
<td width=100> Away <br> 颜色2 </td> <td width=15> </td>
</tr>
<tr>
<td width=150> <a href= "/oddsDetail/Football-Asia-WAFF_West_Asian_Football_Championship_2007/eventId/97305898/betTypeId/47/scopeId/2/site/0 "> 款式 </a> : <br> (4) </td>
<td width=100> Over 2 <br> 款式1 </td> <td width=15> </td>
<td width=100> Under <br> 款式2 </td> <td width=15> </td>
</tr>
</table>
<table class= "os " width= "100% ">
<tr>
<th align= "left " colspan=2>
(总称)WA est 2008 ( <a style= "color: #FFFFFF " href= "/osOview/Foll/spId/1/sitel/0 "> Foll </a> , Asiall)
<br/>
<div class= "smallText " style= "font-weight: normal; color: #FFFFFF "> Highest (average) os and prties listed for selected bes </div>
</th>
</tr>
<tr bgcolor= "#FFFFFF ">
<td valign= "top " width= "15% ">
<a href= "/osOview/产品1-产品2/ev/97305898/site/0 ">
产品1- <br> 产品2 </a> <br/>
上市时间 24/06/07 17:30 </td>
<td style= "padding-bottom:4px; ">
<table class= "osInc " style= "margin-bottom:10px; ">
<tr>
<td width=150> <a href= "/osDetail/Foll-Asiall-WA_est_20078/evId/97305898/bTypeId/43/scId/2/sitel/0 "> 价格 </a> : <br> (2) </td>
<td width=100> Home (27%) <br> 价格1 </td> <td width=15> </td>
<td width=100> Draw (29%) <br> 价格2 </td> <td width=15> </td>
<td width=100> Away (44%) <br> 价格3 </td> <td width=15> </td>
</tr>
<tr>
<td width=150> <a href= "/osDetail/Foll-Asiall-WA_est_2007/evId/97305898/bTypeId/48/scId/2/sitel/0 "> 颜色 </a> : <br> (4) </td>
<td width=100> Home 0 <br> 颜色1 </td> <td width=15> </td>
<td width=100> Away <br> 颜色2 </td> <td width=15> </td>
</tr>
<tr>
<td width=150> <a href= "/oddsDetail/Football-Asia-WAFF_West_Asian_Football_Championship_2007/eventId/97305898/betTypeId/47/scopeId/2/site/0 "> 款式 </a> : <br> (4) </td>
<td width=100> Over 2 <br> 款式1 </td> <td width=15> </td>
<td width=100> Under <br> 款式2 </td> <td width=15> </td>
</tr>
</table>
我想通过正则读取其中的(总称)名和产品1、产品2、价格、价格1、价格2、颜色、颜色1、颜色2、款式、款式1、款式2,读下来写进数据库的表的格式是:
总称 产品1 产品2 价格 价格1 价格2 颜色 颜色1 颜色2 款式 款式1 款式2
本人刚学vb又刚接触正则,实在是没办法啊,请各位大虾帮我想想办法啊,小弟不胜感激啊,在线等
[解决办法]
vb能用正则?
发个ASP的给你
'------------------------------------------------------------------
' GetNewsBody
'   Downloads the page at NewsUrl, cuts out the article body, strips
'   scripts / the outermost DIV / FORM tags, saves every remote image
'   it finds and rewrites the image URLs to the saved copies.
'
'   NewsUrl : address of the page to fetch (passed to GetHtml).
'   Returns : the cleaned-up body HTML.
'
'   Relies on names defined elsewhere in the project: GetHtml,
'   RegReplace, RegExpTest, GetBodyRegex, SaveRemoteFile,
'   SaveFileFolder and DeleteLink.
'
'   NOTE(review): the listing this was taken from had spaces injected
'   around "<", ">" and before closing quotes. The patterns below have
'   that padding removed so they match real markup; the single-space
'   replacement strings are kept as posted - confirm whether "" was
'   intended.
'------------------------------------------------------------------
Public Function GetNewsBody(NewsUrl)
    Dim BodyHtml, SaveFileName
    Dim re, RemoteFile, RemoteFileUrl

    BodyHtml = GetHtml(NewsUrl)                                   ' fetch page source
    BodyHtml = RegReplace(BodyHtml, "\n", " ")                    ' flatten line feeds so "." can span the whole page
    BodyHtml = RegExpTest(GetBodyRegex, BodyHtml, 0)              ' extract the article body
    BodyHtml = RegReplace(BodyHtml, "<script.*?>.*?</script>", " ") ' drop scripts
    BodyHtml = RegReplace(BodyHtml, "(<div.*?>)(.+)", "$2")       ' drop the first <div ...>
    BodyHtml = RegReplace(BodyHtml, "(.+)(</div>)", "$1")         ' drop the last </div>
    BodyHtml = RegReplace(BodyHtml, "<form.*?>", " ")             ' drop <form ...>
    BodyHtml = RegReplace(BodyHtml, "</form>", " ")               ' drop </form>

    ' Download remote images and point the HTML at the local copies.
    Set re = New RegExp
    re.IgnoreCase = True
    re.Global = True
    ' SubMatches(4) = full file name, SubMatches(5) = file extension.
    re.Pattern = "((http):(?:\/\/){1}(?:(?:\w)+[.])+(net|com|cn|org|cc|tv|[0-9]{1,4})(\S*\/)((?:\S)+[.]{1}(gif|jpg|jpeg|png|bmp)))"
    Set RemoteFile = re.Execute(BodyHtml)

    For Each RemoteFileUrl In RemoteFile
        SaveFileName = RemoteFileUrl.SubMatches(4)
        ' Pass the matched text explicitly instead of relying on the
        ' Match object's default property.
        Call SaveRemoteFile(SaveFileFolder & SaveFileName, RemoteFileUrl.Value)   ' save the image
        BodyHtml = Replace(BodyHtml, RemoteFileUrl.Value, SaveFileFolder & SaveFileName) ' rewrite its URL
    Next

    Set RemoteFile = Nothing
    Set re = Nothing

    ' Optionally strip hyperlinks. The tag name must be exactly "a"
    ' (followed by whitespace or ">"), so <abbr>, <area>, <address>
    ' and similar tags are left alone.
    If DeleteLink = True Then
        BodyHtml = RegReplace(BodyHtml, "</?a(\s[^>]*)?>", " ")
    End If

    GetNewsBody = BodyHtml
End Function
[解决办法]
因为你要提取的内容太多了,一句句写正则很花脑筋。下面是本人以前写的一个去掉绝大部分HTML标签的VB函数,希望对你有用,同时希望得到你的加分。
'------------------------------------------------------------------
' htmTotxt
'   Converts an HTML fragment to plain text: removes comments,
'   <style>, <script> and <title> elements, turns <br> into line
'   breaks, strips every remaining tag, decodes a handful of
'   character entities, drops other short entities, and normalises
'   line breaks and leading whitespace.
'
'   s       : the HTML text to convert (ByVal, the caller's string
'             is not modified).
'   Returns : the plain-text result.
'
'   Requires a project reference to "Microsoft VBScript Regular
'   Expressions" (for the RegExp class).
'
'   NOTE(review): the posted listing had its string literals damaged
'   by the forum renderer (padding spaces inside patterns, entity
'   names such as &quot; shown as the decoded character, which left
'   an unbalanced quote and several no-op Replace calls). The
'   literals below are a reconstruction; the numeric entities in
'   particular are a best guess - confirm against the original.
'------------------------------------------------------------------
Private Function htmTotxt(ByVal s As String)
    Dim objRegExp As RegExp

    Set objRegExp = New RegExp
    objRegExp.IgnoreCase = True     ' tag names are matched case-insensitively
    objRegExp.Global = True         ' replace every occurrence, not just the first

    ' --- Remove markup -------------------------------------------------
    objRegExp.Pattern = "<!--[\s\S]*?-->"                  ' HTML comments
    s = objRegExp.Replace(s, " ")
    objRegExp.Pattern = "<(style)[^<]*>[^<]*<\/\1>"        ' style sheets
    s = objRegExp.Replace(s, " ")
    objRegExp.Pattern = "<(script)[^<]*>[\s\S]*?<\/\1>"    ' scripts
    s = objRegExp.Replace(s, " ")
    objRegExp.Pattern = "<br[^>]*>"                        ' <br> becomes a line break
    s = objRegExp.Replace(s, vbCrLf)
    objRegExp.Pattern = "<(title)[^<]*>[^<]*<\/\1>"        ' page title
    s = objRegExp.Replace(s, " ")
    objRegExp.Pattern = "<[^<]*>"                          ' every remaining tag
    s = objRegExp.Replace(s, " ")

    ' --- Decode character entities (after tag stripping, so decoded
    '     "<" / ">" are not mistaken for markup) -------------------------
    s = Replace(s, "&quot;", Chr(34))   ' double quote
    s = Replace(s, "&#39;", Chr(39))    ' single quote
    s = Replace(s, "&lt;", "<")         ' less-than
    s = Replace(s, "&gt;", ">")         ' greater-than
    s = Replace(s, "&#40;", "(")        ' left parenthesis
    s = Replace(s, "&#41;", ")")        ' right parenthesis
    s = Replace(s, "&#42;", "*")        ' asterisk
    s = Replace(s, "&#45;", "-")        ' hyphen (SQL comment character)
    s = Replace(s, "&amp;", "&")        ' ampersand
    s = Replace(s, "&#59;", ";")        ' semicolon
    s = Replace(s, "&middot;", "·")     ' middle dot

    ' --- Drop any other short entity (e.g. &nbsp;) ------------------------
    objRegExp.Pattern = "&[^;]{2,4};"
    s = objRegExp.Replace(s, " ")

    ' --- Normalise line breaks -----------------------------------------
    objRegExp.Pattern = "\n[ \f\r\t\v]*"    ' whitespace following a line feed
    s = objRegExp.Replace(s, vbCrLf)
    objRegExp.Pattern = "[\n\x0a\x0d]+"     ' collapse runs of CR/LF into one break
    s = objRegExp.Replace(s, vbCrLf)

    ' --- Trim leading whitespace ---------------------------------------
    ' Replaced with an empty string: substituting a space here would
    ' leave the result starting with whitespace again.
    objRegExp.Pattern = "^[\s]*"
    s = objRegExp.Replace(s, "")

    Set objRegExp = Nothing
    htmTotxt = s
End Function