源码分享:蜘蛛抓取淘宝网上买过某类商品的用户名。
思路:通过淘宝搜索页链接到达每个商品页,用正则表达式匹配出买过该商品的用户名,并处理分页。
大家看个意思吧,代码能运行;但淘宝的购买者列表已经改成 JS 动态加载了,没法直接采了。
package org.jason.web.spider.tabao;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Spider that collects the user names of buyers of a product category on Taobao.
 *
 * <p>Flow: start from a search-result URL, follow "next page" links (bounded by
 * {@link #I_BREAK} to avoid infinite loops), extract each item-page link, then
 * scrape the buyer names from each item's purchaser list via regex.
 *
 * <p>NOTE(review): the site has since switched the buyer list to JS rendering,
 * so the HTML regexes below no longer match live pages; the code is kept as a
 * working example of the technique.
 */
public class UserSpider {

    // Regex for the "next page" link area of a search-result page.
    private static final String NEXT_PAGE =
            "<a href=\"(.*?)\" class=\"page-next\"><span>下页</span></a>";

    // Regex for an item link inside a search-result page (group 1 = item URL).
    private static final String LINK =
            "<h3 class=\"summary\"><a href=\"(.*?)\" target=_blank onclick=\"(.*?)\" class=\"EventCanSelect\">(.*?)</a></h3>";

    // Regex for a buyer's personal-portal link (group 2 = user name).
    private static final String USER =
            "<a href=\"http://space.taobao.com/(.*?)/portal/personal_portal.htm\" target=\"_blank\">(.*?)</a>";

    // Query string appended to an item URL to request the buyer list page.
    private static final String PAR = "?bid_page=1&page_size=100&is_start=true";

    // Safety cap on how many "next page" hops getLinks() will follow.
    private static final int I_BREAK = 2;

    /**
     * Entry point: crawls from a fixed search URL and prints the de-duplicated
     * buyer names found across all result pages and items.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Map used as a set: key == value == user name, for de-duplication.
        Map<String, String> map = new HashMap<String, String>();
        String s = "http://search1.taobao.com/browse/0/n-g,nfyg6za----------------40--commend-0-all-0.htm?at_topsearch=1&ssid=e-s5";
        List<String> l = getLinks(s);
        System.out.println("总链接数:" + l.size());
        for (String m : l) {
            List<String> o = getPages(m);
            System.out.println("页面数:" + o.size());
            for (String page : o) {
                List<String> u = getUsers(page);
                System.out.println("用户数:" + u.size());
                for (String user : u) {
                    if (map.get(user) == null) {
                        map.put(user, user);
                    }
                }
            }
        }
        for (Map.Entry<String, String> entryTemp : map.entrySet()) {
            System.out.println(entryTemp.getKey());
            // Persist the user here if desired...
        }
    }

    /**
     * Extracts every item-page link from one search-result page.
     *
     * @param sUrl URL of a search-result page
     * @return item URLs found via the {@link #LINK} pattern (group 1)
     */
    private static List<String> getPages(String sUrl) {
        String sText = readURL(sUrl, false, System.getProperty("line.separator"));
        return replaceAll(sText, LINK, 1);
    }

    /**
     * Extracts the buyer user names from one item's purchaser-list page.
     *
     * @param sUrl base item URL; {@link #PAR} is appended to request buyers
     * @return user names found via the {@link #USER} pattern (group 2)
     */
    private static List<String> getUsers(String sUrl) {
        String pageUrl = sUrl + PAR;
        String sText = readURL(pageUrl, false, System.getProperty("line.separator"));
        return replaceAll(sText, USER, 2);
    }

    /**
     * Collects the base URL plus every "next page" URL reachable from it.
     *
     * @param baseUrl first search-result page; may be null/empty
     * @return list of page URLs (empty when baseUrl is null or empty)
     */
    private static List<String> getLinks(String baseUrl) {
        List<String> s = new ArrayList<String>();
        if (baseUrl == null || "".equals(baseUrl)) {
            return s;
        }
        String curUrl = baseUrl;
        s.add(curUrl);
        int i = 0;
        while (true) {
            String sText = readURL(curUrl, false, System.getProperty("line.separator"));
            if ("".equals(sText)) {
                break;
            }
            curUrl = replace(sText, NEXT_PAGE, 1);
            if ("".equals(curUrl)) {
                // No "next page" link found: last page reached.
                break;
            }
            s.add(curUrl);
            // Guard against pagination loops on a malformed site.
            i++;
            if (i > I_BREAK) {
                break;
            }
        }
        return s;
    }

    /**
     * Returns the first match of the given capture group, case-insensitively.
     *
     * @param str     text to search (null/empty tolerated)
     * @param pattern regex with capture groups
     * @param place   1-based capture-group index to return
     * @return the matched group, or "" when there is no match or input is empty
     */
    public static String replace(String str, String pattern, int place) {
        String result = "";
        if (str == null || "".equals(str)) {
            return result;
        }
        try {
            Pattern p = compile(pattern, Pattern.CASE_INSENSITIVE);
            Matcher m = p.matcher(str);
            if (m.find()) {
                result = m.group(place);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Returns every match of the given capture group (case-sensitive).
     *
     * @param str     text to search (null/empty tolerated)
     * @param pattern regex with capture groups
     * @param i       1-based capture-group index to collect
     * @return all matched groups in order; empty list when input is empty
     */
    public static List<String> replaceAll(String str, String pattern, int i) {
        List<String> result = new ArrayList<String>();
        if (str == null || "".equals(str)) {
            return result;
        }
        try {
            Pattern p = Pattern.compile(pattern);
            Matcher m = p.matcher(str);
            while (m.find()) {
                result.add(m.group(i));
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Compiles a regex with the given match flags.
     *
     * @param pattern regex source
     * @param mode    {@link Pattern} flag bits (e.g. {@link Pattern#CASE_INSENSITIVE})
     * @return the compiled pattern
     */
    public static Pattern compile(String pattern, int mode) {
        return Pattern.compile(pattern, mode);
    }

    /**
     * Fetches a URL over HTTP and returns the response body as text.
     *
     * @param url    target URL
     * @param isPost true to send a POST instead of a GET
     * @param line   separator appended after each response line
     * @return the body on HTTP 200, otherwise "" (errors are printed, not thrown)
     */
    public static String readURL(String url, boolean isPost, String line) {
        StringBuffer sBuffer = new StringBuffer();
        try {
            URL urlPath = new URL(url);
            URLConnection urlConnection = urlPath.openConnection();
            HttpURLConnection httpURL = (HttpURLConnection) urlConnection;
            if (isPost) {
                httpURL.setRequestMethod("POST");
            }
            try {
                httpURL.connect();
            } catch (Exception e) {
                e.printStackTrace();
            }
            int httpResult = httpURL.getResponseCode();
            if (httpResult == HttpURLConnection.HTTP_OK) {
                BufferedReader bufferedReader =
                        new BufferedReader(new InputStreamReader(httpURL.getInputStream()));
                String sContent;
                while ((sContent = bufferedReader.readLine()) != null) {
                    sBuffer.append(sContent).append(line);
                }
                bufferedReader.close();
            }
            httpURL.disconnect();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return sBuffer.toString();
    }
}
Tags:
延伸阅读
最新评论