用 Python 爬取各个国家本地的 AS 号及其 IP 地址段

数据源是 ipip.net。因为没有会员,只能爬取网页后自己解析 -_-!。为了防止 IP 被封,没有启用多线程,放在后台慢慢运行解析即可。

记录下代码

#! /usr/bin/env python
#-*-coding:utf-8-*-
import json
import os
import sys

import bs4
import requests
class HTML_OBJ:
    """Scrape whois.ipip.net for a country's AS entries and their IP prefixes.

    Public attributes:

    * ``url`` - base URL of the site, with a trailing slash.
    * ``country`` - country code appended to ``countries/`` (e.g. ``'KR'``).
    * ``As_dic`` - ``{link text: {'descriptions': link title}}`` filled by
      :meth:`get_as`; :meth:`get_json` adds an ``'ipsub'`` list to each entry.
    * ``IPsub_list`` - accumulator that :meth:`get_ipsub` appends to.
    * ``headers`` - HTTP headers sent with every request.
    * ``timeout`` - seconds passed to ``requests.get`` as ``timeout``.
    """

    def __init__(self, country=None, timeout=30):
        """Store the target country and the request settings.

        Args:
            country: Country code used in the listing URL and in the output
                file names.
            timeout: Per-request timeout in seconds, so a stalled connection
                cannot hang a long unattended crawl.
        """
        self.url = 'https://whois.ipip.net/'
        self.country = country
        self.timeout = timeout
        self.As_dic = {}
        self.IPsub_list = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/83.0.4103.97 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;'
                      'q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }

    def get_as(self):
        """Fetch the country listing page and record every titled link.

        Each ``<a>`` element with a non-empty ``title`` attribute is stored in
        ``self.As_dic`` as ``{link text: {'descriptions': title}}``.

        Returns:
            ``self.As_dic`` (the same dict object, updated in place).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = self.url + 'countries/' + self.country
        resp = requests.get(url=url, headers=self.headers,
                            timeout=self.timeout)
        # Do not parse an error page as if it were the listing.
        resp.raise_for_status()
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        for anchor in soup.select('a'):
            title = anchor.get('title')
            if title:
                self.As_dic[anchor.text] = {'descriptions': title}
        return self.As_dic

    def get_ipsub(self, ASN):
        """Fetch the page for ``ASN`` and collect the link texts in its table.

        The texts of all ``<a>`` elements inside the first
        ``.table-responsive`` element are appended to ``self.IPsub_list``.
        The list is NOT cleared here; callers that want per-AS results must
        reset ``self.IPsub_list`` between calls, as :meth:`get_json` does.

        Args:
            ASN: Path component appended to the base URL (e.g. ``'AS2500'``).

        Returns:
            ``self.IPsub_list`` (the same list object, extended in place).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        resp = requests.get(url=self.url + ASN, headers=self.headers,
                            timeout=self.timeout)
        resp.raise_for_status()
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        container = soup.select_one('.table-responsive')
        # A page without the table contributes no prefixes instead of raising
        # AttributeError on None.
        # NOTE(review): presumably such pages are ASes without announced
        # prefixes - confirm against the live site.
        if container is not None:
            self.IPsub_list.extend(link.text for link in container.select('a'))
        return self.IPsub_list

    @staticmethod
    def _write_chunk(output_dir, filename, data):
        """Write ``data`` as JSON to ``<output_dir>/<filename>.json``."""
        path = os.path.join(output_dir, filename + '.json')
        # 'w' rather than 'a+': appending a second JSON document on a re-run
        # would leave a file that no JSON parser accepts.
        with open(path, 'w', newline='\n', encoding='utf-8') as handle:
            handle.write(json.dumps(data, indent=1))

    def get_json(self, output_dir='/root/python/crawler', chunk_size=100):
        """Crawl every AS of the country and dump the results as JSON files.

        Results are written in chunks of ``chunk_size`` entries to
        ``<country>1.json``, ``<country>2.json``, ...; the remainder (possibly
        empty) always goes to ``<country>last.json``. A percentage is written
        to stdout after each AS and erased with backspaces.

        Args:
            output_dir: Directory for the JSON files; created if missing.
            chunk_size: Number of AS entries per numbered file.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError('chunk_size must be a positive integer')
        # Create the directory before crawling, so an unusable path fails
        # immediately and not after the first chunk of requests.
        os.makedirs(output_dir, exist_ok=True)

        write, flush = sys.stdout.write, sys.stdout.flush
        as_dic = self.get_as()
        total = len(as_dic)
        pending = {}
        chunk_no = 1
        for done, asn in enumerate(as_dic, start=1):
            # Start from an empty accumulator so leftovers from an earlier
            # get_ipsub() call cannot leak into this entry.
            self.IPsub_list = []
            self.get_ipsub(asn)
            entry = as_dic[asn]
            entry['ipsub'] = self.IPsub_list
            pending[asn] = entry

            # Scale to a real percentage and count the finished item, so the
            # display ends at 100%.
            show = '{:.3f}%'.format(100.0 * done / total)
            write(show)
            flush()
            write('\x08' * len(show))

            if len(pending) >= chunk_size:
                self._write_chunk(output_dir, self.country + str(chunk_no),
                                  pending)
                chunk_no += 1
                pending = {}

        self.IPsub_list = []
        print('>>>>>>finish')
        self._write_chunk(output_dir, self.country + 'last', pending)

if __name__ == '__main__':
    # The country code may be given as the first command-line argument;
    # without one the script crawls 'KR', as it did when the code was
    # hard-coded.
    country_code = sys.argv[1] if len(sys.argv) > 1 else 'KR'
    HTML_OBJ(country=country_code).get_json()

输出的数据格式(下面的示例是 JP 的抓取结果,末尾的 ... 表示省略)

{"AS2500": {
  "descriptions": "AS2500 - WIDE-BB - WIDE Project, JP",
  "ipsub": [
   "133.4.128.0/18",
   "133.144.0.0/24",
   "133.144.0.0/16",
   "163.221.0.0/24",
   "163.221.0.0/16",
   "192.218.228.0/24",
   "202.249.0.0/18",
   "203.178.128.0/17"
  ]
 },
 "AS2501": {
  "descriptions": "AS2501 - UTnet - The University of Tokyo, JP",
  "ipsub": [
   "130.69.0.0/24",
   "130.69.0.0/16",
   "133.11.0.0/24",
   "133.11.0.0/16",
   "157.82.0.0/24",
   "157.82.0.0/16",
   "157.82.112.0/21",
   "192.51.208.0/20"
  ]
 },
 "AS2504": {
  "descriptions": "AS2504 - NCA5 - Kyoto University, JP",
  "ipsub": [
   "130.54.0.0/24",
   "130.54.0.0/16",
   "133.3.0.0/24",
   "133.3.0.0/16",
   "192.50.8.0/23",
   "192.50.24.0/23"
  ]
 },...
}

Leave a Reply

Your email address will not be published. Required fields are marked *

X