#!/usr/bin/env python
# coding: utf8
"""
Data extractor (minfin.ru)
"""
import csv
import json
import os, os.path
import requests
from urllib.parse import unquote_plus, urlencode
from bs4 import BeautifulSoup, BeautifulStoneSoup
from urllib.parse import urljoin
#import mechanize
#import cookielib
import time
# Site root used to turn the links scraped from listing pages into absolute URLs.
BASE_URL = 'https://www.minfin.ru'
# Tab-separated catalog file: written by extract_catalog, read by extract_all_raw.
CATALOG_FILE = 'data/datasets.csv'
# Listing endpoint; the 1-based page number is appended to the end of this URL.
# NOTE(review): the query asks for P_DATE / DESC ordering - presumably newest first; confirm.
LIST_URL_PAT = 'https://www.minfin.ru/common/module_router.php?filters_area=reset&page_id=769&area_id=57&order_57=P_DATE&dir_57=DESC&page_57='
# Number of items on a full listing page; a shorter page marks the end of the listing.
LIST_SIZE = 15
class DataExtractor:
    """Data extractor for Minfin Budget.

    Builds a tab-separated catalog of the documents listed on minfin.ru
    (``extract_catalog``) and downloads the files referenced by that catalog
    (``extract_all_raw``).
    """

    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        pass

    def extract_catalog(self):
        """Crawl the paginated document listing and write it to CATALOG_FILE.

        The catalog is a tab-separated file with the columns ``name``,
        ``date``, ``url`` and ``file_url``; ``file_url`` is empty when an
        item has no download link.  Crawling stops at the first page that
        holds fewer than LIST_SIZE items or that the server answers with an
        error status.
        """
        catalog_dir = os.path.dirname(CATALOG_FILE)
        if catalog_dir:
            os.makedirs(catalog_dir, exist_ok=True)

        # newline='' plus csv.writer keeps the output readable by the
        # csv.DictReader in extract_all_raw even when a title contains
        # quotes or tabs; UTF-8 is forced because titles are not ASCII.
        with open(CATALOG_FILE, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, delimiter='\t', lineterminator='\n')
            writer.writerow(['name', 'date', 'url', 'file_url'])

            page = 1
            while True:
                listurl = LIST_URL_PAT + str(page)
                print(listurl)
                response = requests.get(listurl, timeout=self.REQUEST_TIMEOUT)
                if not response.ok:
                    # An error page contains no items, so the crawl ends here
                    # either way; report it instead of stopping silently.
                    print('Listing request failed', response.status_code, listurl)
                    break

                # Name the parser explicitly so results do not depend on
                # which optional parsers happen to be installed.
                soup = BeautifulSoup(response.text, 'html.parser')
                items = soup.find_all(
                    'div', attrs={'class': 'doc-view-item doc-view ajax-link'})
                for item in items:
                    header = item.find('div')
                    title = header.find('p', attrs={'class': 'dvi-title'}).text
                    adate = header.find('dd').text
                    url = item.find('a', attrs={
                        'class': 'ajax-link doc-view-actions-item '
                                 'doc-view-actions-item_view doc-view'})['data-href']
                    # The download link is optional.
                    download = item.find('a', attrs={
                        'class': 'doc-view-actions-item '
                                 'doc-view-actions-item_download'})
                    file_url = download.get('href', '') if download is not None else ''
                    writer.writerow([
                        title.strip().replace('\n', ' '),
                        adate,
                        urljoin(BASE_URL, url),
                        urljoin(BASE_URL, file_url.strip()) if file_url else '',
                    ])

                if len(items) < LIST_SIZE:
                    break
                page += 1

    def extract_all_raw(self):
        """Download every file listed in CATALOG_FILE into ``data/raw``.

        Each file is stored under ``data/raw/<path after '.ru/'>``.  Rows
        without a ``file_url`` and files that already exist on disk are
        skipped.  A download answered with an error status is reported and
        not saved, so it is retried on the next run.
        """
        with open(CATALOG_FILE, 'r', encoding='utf-8', newline='') as catalog:
            for item in csv.DictReader(catalog, delimiter="\t"):
                url = item['file_url']
                if not url:
                    continue

                filename = url.rsplit('/', 1)[-1]
                # Mirror the site's directory layout below data/raw.
                filepath = 'data/raw/' + url.split('.ru/', 1)[-1].rsplit('/', 1)[0]
                os.makedirs(filepath, exist_ok=True)

                target = filepath + '/' + filename
                if os.path.exists(target):
                    continue

                r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                if not r.ok:
                    # Do not store an error page under the document's name.
                    print('Failed', r.status_code, url)
                    continue
                with open(target, 'wb') as f:
                    f.write(r.content)
                print('Downloaded', url)
if __name__ == "__main__":
    # Script entry point: only the download step runs; the catalog rebuild
    # is left commented out and must be enabled by hand.
    extractor = DataExtractor()
    # extractor.extract_catalog()
    extractor.extract_all_raw()