Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python2
- # coding: utf-8
- #
- # Copyright 2013 Alexandre Fiori
- #
- # Licensed under the Apache License, Version 2.0 (the "License"); you may
- # not use this file except in compliance with the License. You may obtain
- # a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- # License for the specific language governing permissions and limitations
- # under the License.
- import csv
- import urllib2
- import os
- import sqlite3
- import sys
- import unicodedata
- import zipfile
- dbname = "ipdb.sqlite"
- def download(url, filename=None):
- # print "Downloading" + url
- req = urllib2.Request(url, None, {"User-Agent": "Mozilla/5.0"})
- req = urllib2.urlopen(req)
- data = req.read()
- req.close()
- if filename:
- fd = open(filename, "wb")
- fd.write(data)
- fd.close()
- return data
- def extract(zipfd, zipname, filename=None):
- if filename is None:
- filename = zipname
- # print "Extracting " + filename
- try:
- data = zipfd.read(zipname)
- except KeyError:
- # print "Could not extract %s from zip archive." % zipname
- sys.exit(1)
- else:
- fd = open(filename, "w")
- fd.write(data)
- fd.close()
- def import_csv(cursor, csvfile, table, skip_lines=0):
- sys.stdout.write("Importing %s: " % csvfile)
- sys.stdout.flush()
- fd = open(csvfile)
- for n in range(skip_lines):
- fd.next()
- q = None
- rows = []
- for n, row in enumerate(csv.reader(fd), 1):
- if q is None:
- question_marks = ",".join(["?"] * len(row))
- q = "insert into %s values (%s)" % (table, question_marks)
- rows.append(map(lambda s: s.decode("latin-1"), row))
- if not n % 100000:
- sys.stderr.write(".")
- cursor.executemany(q, rows)
- rows = []
- if rows:
- cursor.executemany(q, rows)
- fd.close()
- # print "%d records!" % n
- os.unlink(csvfile)
- class world_regions(dict):
- """Imports a csv and only store rows that contains accented characters,
- indexing them by their non-accented version::
- country,region (no accents) -> region with accents
- Expected csv columns: country,region,city
- """
- def __init__(self, filename=None, autorm=True):
- self.filename = filename
- if filename:
- fd = open(filename)
- for row in csv.reader(fd):
- v = map(lambda s: s.decode("utf-8"), row[:2])
- k = self.strip_accents(",".join(v))
- if k != v:
- self[k] = v[1]
- fd.close()
- if autorm is True:
- os.unlink(filename)
- def strip_accents(self, s):
- return ''.join((c for c in unicodedata.normalize('NFD', s)
- if unicodedata.category(c) != 'Mn'))
- class world_countries(dict):
- def __init__(self, conn):
- curs = conn.cursor()
- curs.execute("SELECT country_code, country_name from country_blocks")
- for (code, name) in curs:
- self[code] = name
- curs.close()
- if __name__ == "__main__":
- region_url = "http://dev.maxmind.com/static/csv/codes/maxmind/region.csv"
- region_csv = os.path.basename(region_url)
- if not os.path.exists(region_csv):
- download(region_url, region_csv)
- wr_url = "http://blog.freegeoip.net/files/all_cities_in_the_world.csv.zip"
- wr_zip = os.path.basename(wr_url)
- wr_csv = wr_zip[:-4]
- if not os.path.exists(wr_csv):
- if not os.path.exists(wr_zip):
- download(wr_url, wr_zip)
- zipfd = zipfile.ZipFile(wr_zip)
- extract(zipfd, wr_csv, wr_csv)
- zipfd.close()
- os.unlink(wr_zip)
- geolite = "http://geolite.maxmind.com/download/geoip/database/"
- city_url = geolite + "GeoLiteCity_CSV/GeoLiteCity-latest.zip"
- city_files = ["GeoLiteCity-Blocks.csv", "GeoLiteCity-Location.csv"]
- if not all(map(os.path.exists, city_files)):
- # print "Checking " + city_url
- # Fetch the most recent city database
- city_zip = os.path.basename(city_url)
- if not os.path.exists(city_zip):
- download(city_url, city_zip)
- # Extract city csv files
- zipfd = zipfile.ZipFile(city_zip)
- for filename in zipfd.namelist():
- n = os.path.basename(filename)
- if n in city_files and not os.path.exists(n):
- extract(zipfd, filename, n)
- zipfd.close()
- os.unlink(city_zip)
- # Fetch the country database
- country_url = geolite + "GeoIPCountryCSV.zip"
- country_zip = os.path.basename(country_url)
- if not os.path.exists(country_zip):
- download(country_url, country_zip)
- country_csv = "GeoIPCountryWhois.csv"
- if not os.path.exists(country_csv):
- zipfd = zipfile.ZipFile(country_zip)
- extract(zipfd, country_csv)
- zipfd.close()
- os.unlink(country_zip)
- # Create the IP database
- tmpdb = "_" + dbname + ".temp"
- if os.path.exists(tmpdb):
- os.unlink(tmpdb)
- conn = sqlite3.connect(tmpdb)
- curs = conn.cursor()
- curs.execute("""\
- create table country_blocks (
- ip_start_str text,
- ip_end_str text,
- ip_start text,
- ip_end text,
- country_code text,
- country_name text,
- primary key(ip_start))""")
- import_csv(curs, country_csv, "country_blocks")
- curs.execute("CREATE INDEX cc_idx ON country_blocks(country_code);")
- curs.execute("""\
- create table region_names (
- country_code text,
- region_code text,
- region_name text,
- unique (country_code, region_code))""")
- import_csv(curs, region_csv, "region_names")
- curs.execute("""\
- create table city_blocks (
- ip_start int,
- ip_end int,
- loc_id int,
- primary key(ip_start))""")
- import_csv(curs, city_files[0], "city_blocks", skip_lines=2)
- curs.execute("""\
- create table city_location (
- loc_id int,
- country_code text,
- region_code text,
- city_name text,
- postal_code text,
- latitude real,
- longitude real,
- metro_code text,
- area_code text,
- primary key(loc_id))""")
- import_csv(curs, city_files[1], "city_location", skip_lines=2)
- curs.close()
- conn.commit()
- # Fix region names
- sys.stdout.write("Updating region names: ")
- sys.stdout.flush()
- world_regions = world_regions("all_cities_in_the_world.csv")
- world_countries = world_countries(conn)
- count = 0
- regions = conn.cursor()
- regions.execute("SELECT rowid, * FROM region_names")
- for region in regions:
- region_name = region[-1] # rowid,country_code,region_code,region_name
- country_name = world_countries.get(region[1])
- if country_name:
- k = country_name + "," + region_name
- if k in world_regions:
- new_name = world_regions[k]
- if region_name != new_name:
- update = conn.cursor()
- update.execute("UPDATE region_names SET region_name=? "
- "WHERE rowid=?", (new_name, region[0]))
- update.close()
- count += 1
- # print "%d names updated." % count
- # Fix db consistency. (patch from Alfredo Terrone)
- regions.execute("INSERT INTO region_names "
- "SELECT DISTINCT country_code,'','' FROM region_names")
- regions.close()
- conn.commit()
- conn.close()
- # Replace any existing db with the new one
- if os.path.exists(dbname):
- os.unlink(dbname)
- os.rename(tmpdb, dbname)
Advertisement
Add Comment
Please, Sign In to add comment