您现在的位置是:网站首页> 编程开发> java 编程开发
java-采集省市县街道四级联动源码
jeef2021-03-23【java】
2931人已围观
简介通过java-采集国家统计局四级联动程序,数据来源是 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html数据采集只为了学习讨论使用,请大家不要恶意做为非法用途。 闲话少说,直接贴代码。需要引hutool的pom文件,可以自己去找最新版<dependency> <groupId>cn.hu
java-采集省市县街道四级联动源码
最后更新:2021-03-23 16:20:18
推荐指数:
本文用 Java 采集国家统计局的省、市、县(区)、街道四级联动数据,数据来源是 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
数据采集仅供学习讨论使用,请勿用于非法用途。闲话少说,直接贴代码。
需要在 pom 文件中引入 hutool 依赖,版本可以自行换成最新版:
<!-- Hutool all-in-one bundle; the scraper below imports cn.hutool.http.HttpUtil and cn.hutool.json.JSONArray from it. -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.5.5</version>
</dependency>
package com.pss.mall.admin.controller;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONArray;
import com.pss.mall.admin.WebApplication;
import com.pss.mall.common.util.SpringUtil;
import com.pss.mall.entity.model.AreaNew;
import com.pss.mall.service.AreaNewService;
import com.pss.mall.service.impl.AreaNewServiceImpl;
import org.springframework.boot.SpringApplication;
import org.springframework.context.ApplicationContext;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Address scraper: walks the province / city / county / town pages of the
 * 2020 statistical division code listing and stores every entry it finds as
 * an {@link AreaNew} row through {@link AreaNewService}.
 *
 * @Author: Jeff
 * @Date: 2021/3/22 13:50
 */
public class test {

    /** Directory that the index page and all province/city links are resolved against. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";

    /** Province link on the index page: group 1 = href, group 2 = name. */
    private static final Pattern PROVINCE_LINK =
            Pattern.compile("<a href='(.*?)'>(.*?)<br/></a>", Pattern.CANON_EQ);

    /** City row: group 1 = href, group 2 = code, group 4 = name. */
    private static final Pattern CITY_ROW = Pattern.compile(
            "<tr class='citytr'><td><a href='(.*?)'>(.*?)</a></td><td><a href='(.*?).html'>(.*?)</a></td></tr>",
            Pattern.CANON_EQ);

    /** County row: group 1 = href, group 2 = code, group 4 = name. */
    private static final Pattern COUNTY_ROW = Pattern.compile(
            "<tr class='countytr'><td><a href='(.*?)'>(.*?)</a></td><td><a href='(.*?)'>(.*?)</a></td></tr>",
            Pattern.CANON_EQ);

    /** Town/street row (any pair of adjacent linked cells): group 2 = code, group 4 = name. */
    private static final Pattern TOWN_ROW = Pattern.compile(
            "<a href='(.*?)'>(.*?)</a></td><td><a href='(.*?)'>(.*?)</a>",
            Pattern.CANON_EQ);

    /**
     * Boots the Spring application so that beans are available, then crawls
     * the listing starting from its index page.
     *
     * @param args command-line arguments, forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(WebApplication.class, args);
        String strurl = BASE_URL + "index.html"; // target URL
        try {
            getP(strurl);
        } catch (Exception e) {
            // Top-level boundary of a one-off tool: report the failure and stop crawling.
            e.printStackTrace();
        }
    }

    /**
     * Downloads one page and returns its body as text. The body is also
     * echoed to standard output.
     *
     * <p>The request is a GET whose Cookie, User-Agent and Accept values are
     * sent as real HTTP headers. Passing them as the map argument of
     * {@code HttpUtil.post} would submit them as form fields instead.
     *
     * @param htmlurl absolute URL of the page to fetch
     * @return the response body
     */
    public static String getOneHtml(String htmlurl) {
        // NOTE(review): the cookie is a hard-coded session value whose own
        // "expires" attribute is 23 Mar 2021 - refresh or remove it before reuse.
        String str = HttpUtil.createGet(htmlurl)
                .header("Cookie", "wzws_cid=d21de43cf846b12ee9804e34afdbb29987840ab591e7ee9c7b0c63342ee8da64554032b01e999718fe7a51320bce6eee6998194cc56c072746f8d5e8ee6beb6a41b1f21fae7b02dfc6830409f5e2f669f2eaa5b0b7a523dd4917aaee4ddeca99; path=/; HttpOnly; expires=Tue, 23 Mar 2021 04:14:07 GMT")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .timeout(-1)
                .execute()
                .body();
        System.out.println(str);
        return str;
    }

    /**
     * Crawls the whole tree below the given index page. For each province
     * link found, the province and all of its cities, counties and towns are
     * collected into one list, printed as JSON and saved in a single batch.
     *
     * @param s URL of the index page that lists the provinces
     */
    public static void getP(String s) {
        String str = getOneHtml(s);
        Matcher ma = PROVINCE_LINK.matcher(str);
        while (ma.find()) {
            List<AreaNew> eList = new ArrayList<>();
            // Literal replace: replaceAll would treat the '.' as a regex wildcard.
            String code = ma.group(1).replace(".html", "");
            // Pad the province prefix with ten zeros, as the original ID scheme does.
            AreaNew province = newArea(code + "0000000000", ma.group(2), 1, 0L);
            eList.add(province);

            // NOTE(review): requests are issued back to back with no delay between them.
            String cityStr = getOneHtml(BASE_URL + ma.group(1)); // city page URL
            Matcher cityMa = CITY_ROW.matcher(cityStr);
            while (cityMa.find()) {
                AreaNew city = newArea(cityMa.group(2), cityMa.group(4), 2, province.getAreaId());
                eList.add(city);

                String areaStr = getOneHtml(BASE_URL + cityMa.group(1)); // county page URL
                Matcher areaMa = COUNTY_ROW.matcher(areaStr);
                while (areaMa.find()) {
                    AreaNew area = newArea(areaMa.group(2), areaMa.group(4), 3, city.getAreaId());
                    eList.add(area);

                    // County hrefs are prefixed with the province code directory.
                    String towntrStr = getOneHtml(BASE_URL + code + "/" + areaMa.group(1)); // town page URL
                    Matcher towntrMa = TOWN_ROW.matcher(towntrStr);
                    while (towntrMa.find()) {
                        eList.add(newArea(towntrMa.group(2), towntrMa.group(4), 4, area.getAreaId()));
                    }
                }
            }

            JSONArray eJson = new JSONArray(eList);
            System.out.println(eJson);
            if (!eList.isEmpty()) {
                AreaNewService areaNewService = SpringUtil.getBean(AreaNewServiceImpl.class);
                areaNewService.saveBatch(eList);
            }
        }
    }

    /** Builds one area row from a numeric code string, a name, a level and a parent ID. */
    private static AreaNew newArea(String id, String name, int level, Long parentId) {
        AreaNew area = new AreaNew();
        area.setAreaId(Long.valueOf(id));
        area.setAreaName(name);
        area.setLevel(level);
        area.setParentId(parentId);
        return area;
    }
}
package com.pss.mall.common.util;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
/**
 * Static access point to the Spring {@link ApplicationContext} for code that
 * is not itself a managed bean. The context handed to
 * {@link #setApplicationContext(ApplicationContext)} is stored in a static
 * field; only the first non-null assignment is kept.
 *
 * @Author: Jeff
 * @Date: 2021/3/23 10:45
 */
@Component
public class SpringUtil implements ApplicationContextAware {

    private static ApplicationContext applicationContext = null;

    /**
     * Stores the supplied context unless one has already been stored.
     *
     * @param applicationContext the context to remember
     * @throws BeansException declared by the implemented interface
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        if (SpringUtil.applicationContext != null) {
            return;
        }
        SpringUtil.applicationContext = applicationContext;
    }

    /**
     * @return the stored context, or {@code null} if none has been stored yet
     */
    public static ApplicationContext getApplicationContext() {
        return applicationContext;
    }

    /**
     * Looks a bean up by its name.
     *
     * @param name bean name
     * @return the bean instance
     */
    public static Object getBean(String name) {
        ApplicationContext ctx = getApplicationContext();
        return ctx.getBean(name);
    }

    /**
     * Looks a bean up by its type.
     *
     * @param clazz required bean type
     * @return the bean instance
     */
    public static <T> T getBean(Class<T> clazz) {
        ApplicationContext ctx = getApplicationContext();
        return ctx.getBean(clazz);
    }

    /**
     * Looks a bean up by name and type.
     *
     * @param name  bean name
     * @param clazz required bean type
     * @return the bean instance
     */
    public static <T> T getBean(String name, Class<T> clazz) {
        ApplicationContext ctx = getApplicationContext();
        return ctx.getBean(name, clazz);
    }
}
package com.pss.mall.entity.model;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import java.io.Serializable;
/**
 * One row of the {@code tz_area_new} table: a single administrative area
 * together with the ID of its parent area and its depth in the hierarchy.
 *
 * @Author: Jeff
 * @Date: 2021/3/23 11:11
 */
@Data
@TableName("tz_area_new")
public class AreaNew implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Area ID, mapped to column {@code area_id}. */
    @TableField("area_id")
    private Long areaId;

    /** Display name of the area, mapped to column {@code area_name}. */
    @TableField("area_name")
    private String areaName;

    /** ID of the enclosing area, mapped to column {@code parent_id}; the scraper in this article uses 0 for provinces. */
    @TableField("parent_id")
    private Long parentId;

    /** Hierarchy depth, mapped to column {@code level}; the scraper in this article writes 1 (province) to 4 (town). */
    @TableField("level")
    private Integer level;
}
很赞哦! (125)
下一篇:Linux服务器上Jdk的安装
文章评论
验证码:
