利用百度地图API拆分纯真IP数据库地址
导出txt,并转码
利用安装后的 ip.exe 导出 txt 文件。由于 Python 处理 GBK 编码的中文时会有些问题,所以我们直接用编辑器打开导出的文件,再以 UTF-8 编码另存就可以了。
提取所有地址
由于我们需要通过百度地图的 API 一个地址一个地址地查询,所以先去除重复的地址可以减少我们的查询量。
# coding: utf-8
# Python offers many ways to de-duplicate values; a set is used here.
import re

# Columns in the exported text file are separated by runs of spaces.
FIELD_SEPARATOR = re.compile(' +')


def extract_addresses(lines):
    """Return the set of distinct values found in the third column of *lines*.

    Each line is stripped and split on runs of spaces. Blank lines and
    lines with fewer than three columns are skipped instead of raising
    IndexError, so a malformed line cannot abort the whole run.

    Args:
        lines: An iterable of text lines (for example an open file).

    Returns:
        A set of address strings.
    """
    addresses = set()
    for line in lines:
        fields = FIELD_SEPARATOR.split(line.strip())
        if len(fields) > 2:
            addresses.add(fields[2])
    return addresses


def load_addresses(path='ipdata.txt'):
    """Read the UTF-8 export at *path* and return its distinct addresses."""
    with open(path, 'r', encoding='utf-8') as handle:
        return extract_addresses(handle)


if __name__ == '__main__':
    # addrs holds every distinct address found in the export.
    addrs = load_addresses()
上面我们已经过滤出所有的地址了,接下来就是使用百度的API来拆解这些地址。
import re, urllib, time, json
from urllib import parse, request
# Columns in the exported text file are separated by runs of spaces.
FIELD_SEPARATOR = re.compile(' +')

# Geocoder endpoint; the key placeholder must be replaced with a real API key.
GEOCODER_URL = 'http://api.map.baidu.com/geocoder?output=json&key=你的API key'


def extract_addresses(lines):
    """Return the set of distinct values found in the third column of *lines*.

    Blank lines and lines with fewer than three space-separated columns
    are skipped instead of raising IndexError.
    """
    addresses = set()
    for line in lines:
        fields = FIELD_SEPARATOR.split(line.strip())
        if len(fields) > 2:
            addresses.add(fields[2])
    return addresses


def geocode_url(addr):
    """Return the request URL that looks up coordinates for *addr*."""
    return GEOCODER_URL + '&address=' + parse.quote(addr)


def reverse_geocode_url(geocode_data):
    """Return the reverse-lookup URL for a geocoder reply, or None.

    None is returned when the reply has no 'result' key or its value is
    falsy (i.e. the lookup produced nothing to reverse-geocode).
    """
    result = geocode_data.get('result')
    if not result:
        return None
    point = result['location']
    location = str(point['lat']) + ', ' + str(point['lng'])
    return GEOCODER_URL + '&location=' + parse.quote(location)


def format_components(reverse_data):
    """Return 'province city district' from a reverse-geocoder reply.

    An empty string is returned when the reply has no usable 'result'.
    """
    result = reverse_data.get('result')
    if not result:
        return ''
    component = result['addressComponent']
    return component['province'] + ' ' + component['city'] + ' ' + component['district']


def fetch_json(target):
    """GET *target* and return the decoded JSON body.

    The response is closed as soon as it has been read.
    """
    with request.urlopen(target) as response:
        return json.loads(response.read().decode())


def split_address(addr, fetch=fetch_json):
    """Resolve *addr* into 'province city district', or '' if unresolved.

    The address is geocoded to coordinates first, then the coordinates are
    reverse-geocoded to obtain the structured components. *fetch* is the
    callable used to retrieve and decode each URL.
    """
    lookup = reverse_geocode_url(fetch(geocode_url(addr)))
    if lookup is None:
        return ''
    return format_components(fetch(lookup))


def main():
    """Resolve every distinct address in ipdata.txt into address/list.txt."""
    with open('ipdata.txt', 'r', encoding='utf-8') as handle:
        addrs = extract_addresses(handle)
    # The context manager closes (and flushes) the output even when a
    # request fails part-way through, so rows already written are kept.
    with open('address/list.txt', 'w', encoding="utf-8") as wh:
        # Sorted so the output order is the same on every run.
        for addr in sorted(addrs):
            # If requests need throttling, pause here: time.sleep(3)
            wh.write(addr + '\t' + split_address(addr) + '\n')


if __name__ == '__main__':
    main()
未完待续...