网络爬虫
网络爬虫(Web crawler),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。
1.环境依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.4</version>
</dependency>
2.入门程序
//1. 打开浏览器,创建HttpClient对象
CloseableHttpClient httpClient = HttpClients.createDefault();
//2. 输入网址,发起get请求创建HttpGet对象
HttpGet httpGet = new HttpGet("http://www.yifengfighting.cn");
//3.按回车,发起请求,返回响应,使用HttpClient对象发起请求
CloseableHttpResponse response = httpClient.execute(httpGet);
//4. 解析响应,获取数据
//判断状态码是否是200
if (response.getStatusLine().getStatusCode() == 200) {
HttpEntity httpEntity = response.getEntity();
String content = EntityUtils.toString(httpEntity, "utf8");
System.out.println(content);
}
3.带参数的get请求
//设置请求地址是:http://yun.itheima.com/search?keys=Java
//创建URIBuilder
URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
//设置参数
uriBuilder.setParameter("keys","Java");
//创建HttpGet对象,设置url访问地址
HttpGet httpGet = new HttpGet(uriBuilder.build());
4.不带参数的post请求
HttpGet httpGet = new HttpGet(“http://www.yifengfighting.cn");
–>
HttpPost httpPost = new HttpPost(“http://www.yifengfighting.cn");
5.带参数的post请求
//创建HttpPost对象,设置url访问地址
HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
//声明List集合,封装表单中的参数
List<NameValuePair> params = new ArrayList<NameValuePair>();
//设置请求地址是:http://yun.itheima.com/search?keys=Java
params.add(new BasicNameValuePair("keys","Java"));
//创建表单的Entity对象,第一个参数就是封装好的表单数据,第二个参数就是编码
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8");
//设置表单的Entity对象到Post请求中
httpPost.setEntity(formEntity);