#!/usr/bin/env python
# -*- coding: utf-8 -*-
import click
import json
import os
import glob
import csv
import datetime
import io
import xlwt
import sys
import yaml
import shutil
import hashlib
import zlib
from lxml.html import fromstring, etree
from tabulate import tabulate
import requests
import re
from pymongo import MongoClient, DESCENDING, ASCENDING
# Page that listfiles() downloads and scrapes for the tree of published documents.
# NOTE(review): the long number in the query string looks like a session id and
# may go stale over time — confirm the URL still resolves.
CUSTOMS_URL = 'http://stat.customs.ru/apex/f?p=201:7:4435457763961247::NO'
# Base URL prepended to each document's href to build its download URL.
PREFIX_URL = 'http://stat.customs.ru/apex/'
# Directory (relative to the working directory) that downloaded files are written to.
STORAGE_PATH = 'files'
def get_filename_from_cd(cd):
    """
    Get filename from content-disposition

    Args:
        cd: Value of a ``Content-Disposition`` header, or None/empty when the
            header is absent.

    Returns:
        The filename with every '/' replaced by '_' (so it cannot point into
        another directory), or None when the header is missing or carries no
        usable ``filename=`` parameter.
    """
    if not cd:
        return None
    # Accept either a double-quoted value (which may itself contain ';') or a
    # bare token that ends at the next ';'. This keeps the surrounding quotes
    # and any trailing header parameters out of the returned name.
    match = re.search(r'filename=(?:"([^"]*)"|([^;]+))', cd)
    if match is None:
        return None
    quoted, bare = match.groups()
    name = (quoted if quoted is not None else bare).strip()
    if not name:
        return None
    return name.replace('/', '_')
def _store_files(data):
    """Download every listed document into STORAGE_PATH, skipping existing files.

    Records are processed in reverse order. For each record a HEAD request is
    issued first to learn the file name from the ``content-disposition``
    header; the body is fetched only when no file of that name is stored yet.

    Args:
        data: Sequence of dicts, each with a 'url' key holding the download URL.
    """
    # Make sure the target directory exists so the first write cannot fail.
    os.makedirs(STORAGE_PATH, exist_ok=True)
    with requests.Session() as s:
        for record in reversed(data):
            r = s.head(record['url'], allow_redirects=True)
            filename = get_filename_from_cd(r.headers.get('content-disposition'))
            if not filename:
                # Without a name there is nowhere to store the file; skip the
                # record instead of crashing in os.path.join.
                print('No filename for %s, skipped' % (record['url']))
                continue
            out = os.path.join(STORAGE_PATH, filename)
            if os.path.exists(out):
                print('Filename %s exists' % (filename))
                continue
            r = s.get(record['url'], allow_redirects=True)
            if not r.ok:
                # Do not save an error page under the document's name: it
                # would be treated as already downloaded on the next run.
                print('Filename %s not downloaded, HTTP status %s' % (filename, r.status_code))
                continue
            with open(out, 'wb') as f:
                f.write(r.content)
            print('Filename %s downloaded' % (filename))
def listfiles():
    """Scrape the customs statistics page, list its documents and download them.

    The document tree is read from the third ``<script>`` element of the page
    at CUSTOMS_URL and walked as year -> group -> document. The resulting
    table is printed, written to ``filelist.csv`` in the working directory,
    and passed to ``_store_files`` for downloading.

    Returns:
        A list of dicts with the keys 'year', 'group', 'docname' and 'url'.
    """
    page = requests.get(CUSTOMS_URL)
    js = etree.HTML(page.text).xpath('//script')[2].text
    # Keep only the right-hand side of the JavaScript assignment.
    js = js.split('=', 1)[1].strip()
    # Quote the bare object keys so the literal can be parsed as JSON.
    js = (
        js.replace('attributes:', '"attributes":')
        .replace('data:', '"data":')
        .replace('children:', '"children":')
        .strip()
        .strip(';')
    )
    jsdata = json.loads(js)

    files = []
    for yd in jsdata:
        year = yd['data']['title']
        for gd in yd['children']:
            group = gd['data']['title']
            for dd in gd['children']:
                files.append({
                    'year': year,
                    'group': group,
                    'docname': dd['data']['title'],
                    'url': PREFIX_URL + dd['data']['attributes']['href'],
                })
    print(tabulate(files))

    # newline='' is what the csv module requires of files it writes to; the
    # explicit encoding keeps non-ASCII titles from depending on the locale.
    with open('filelist.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['year', 'group', 'docname', 'url'])
        writer.writeheader()
        writer.writerows(files)

    _store_files(files)
    return files
@click.group()
def cli1():
    # Empty group callback: the group only serves as a place to register
    # commands. A comment is used instead of a docstring because click would
    # show a docstring as help text.
    return None
@cli1.command()
def collect():
    """Collect customs files"""
    # The list returned by listfiles() is not used here; the command is run
    # for its side effects (printed table, filelist.csv, downloaded files).
    listfiles()
# Single CLI entry point that exposes the commands registered on cli1.
cli = click.CommandCollection(sources=[cli1])

if __name__ == "__main__":
    # Dispatch to the requested command only when run as a script.
    cli()