{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Класс для объединения множества однотипных файлов excel в один набор данных\n",
"----------\n",
"\n",
"Постановка задачи (что наш класс должен уметь):\n",
" - собирать файлы:\n",
" - из файловой системы по маске;\n",
" - с веб-сервера (может быть даже с предварительной авторизацией);\n",
" - скачивать и доставать книги из архивов, в т.ч. если в архиве много однотипных файлов;\n",
" - объединять таблицы с данными:\n",
" - из определённых листов загружаемых книг (например первый лист каждой книги)\n",
" - перебирая все рабочие листы всех загружаемых книг\n",
" - добавлять в итоговый набор произвольное количество колонок, идентифицирующих источник:\n",
" - название файла;\n",
" - название рабочего листа;\n",
" - произвольные ячейки каждого рабочего листа, например заголовок или подпись ответственного лица, даты и т.д.\n",
" - выводить получаемый набор в выбранном формате:\n",
" - DataFrame;\n",
" - генератор словарей;\n",
" - генератор NamedTyple;\n",
" - записывать в csv или базу данных."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Установка и импорт зависимостей"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (1.0.1)\n",
"Requirement already satisfied: openpyxl in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.0.3)\n",
"Requirement already satisfied: xlrd in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (1.2.0)\n",
"Requirement already satisfied: chardet in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.0.4)\n",
"Requirement already satisfied: rarfile in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.1)\n",
"Requirement already satisfied: numpy>=1.13.3 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (1.18.1)\n",
"Requirement already satisfied: pytz>=2017.2 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (2019.3)\n",
"Requirement already satisfied: python-dateutil>=2.6.1 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (2.8.1)\n",
"Requirement already satisfied: jdcal in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from openpyxl) (1.4.1)\n",
"Requirement already satisfied: et_xmlfile in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from openpyxl) (1.0.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from python-dateutil>=2.6.1->pandas) (1.14.0)\n"
]
}
],
"source": [
"!pip install pandas openpyxl xlrd chardet rarfile"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import glob\n",
"import re\n",
"from itertools import islice, chain\n",
"from collections.abc import Iterable\n",
"from urllib.parse import urlparse\n",
"from urllib.request import urlopen\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Объявление класса"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"class ExcelPreprocessor:\n",
" def __init__(self, *wb, append_global_index=False, append_wb_index=False, \n",
" filename_parser=None, append_ws_title_column=False, first_row_number=1,\n",
" iterate_over_worksheet=0, cells=None, headers=None, in_archive_re_mask=None, filter_empty_cells=False):\n",
" '''\n",
" параметр iterate_over_worksheet отвечает за то, какой именно рабочий лист в каждой прочитанной\n",
" книге будет разобран по умолчанию это лист с индексом 0, т.е. 1 в списке.\n",
" Номера листов можно задать списком, например [4, 5] (пятый и шестой соответственно)\n",
" также можно указать \"ALL\" или True, тогда будут прочитаны все листы всех книг\n",
" и данные из них объединены в один набор\n",
" '''\n",
" self.wb = []\n",
" for w in wb:\n",
" if isinstance(w, list):\n",
" self.wb += w\n",
" elif isinstance(w, str):\n",
" self.wb += [w]\n",
" else:\n",
" raise Exception(f'the \"{w}\" is of type {type(w)}, need list or str')\n",
"\n",
" self.global_index = append_global_index\n",
" self.wb_index = append_wb_index\n",
" self.filename_parser = filename_parser\n",
" self.append_ws_title_column = append_ws_title_column\n",
" self.first_row_number = first_row_number\n",
" self.iterate_over_worksheet = iterate_over_worksheet\n",
" self.cells = cells\n",
" self.headers = headers\n",
" self.in_archive_re_mask = in_archive_re_mask\n",
" self.filter_empty_cells = filter_empty_cells\n",
"\n",
" def __repr__(self):\n",
" return f'''{ExcelPreprocessor.__name__}{self.wb}:\n",
" add global index: {self.global_index}\n",
" add workbook index: {self.wb_index}\n",
" filename parser: {self.filename_parser}\n",
" append worksheet title: {self.append_ws_title_column}\n",
" first row number in worksheet: {self.first_row_number}\n",
" worksheets in all wb to iterate: {self.iterate_over_worksheet}\n",
" cells: {self.cells}\n",
" headers: {self.headers}'''\n",
"\n",
" @property\n",
" def _process_files(self):\n",
" '''\n",
" обрабатывает файлы в списке\n",
" принимает обычные файлы, zip-архивы и http - пути\n",
" '''\n",
" FN_PATTERN = '.*\\.(xls[xm]|xls)$'\n",
" \n",
" for filename in self.wb:\n",
" rf = filename\n",
" if filename.startswith('http'):\n",
" import io\n",
" with urlopen(filename) as resp:\n",
" rf = io.BytesIO(resp.read())\n",
" if re.match(FN_PATTERN, filename):\n",
" yield filename, rf\n",
" elif re.match(FN_PATTERN, filename):\n",
" yield filename, open(filename, 'rb')\n",
"\n",
" if filename.endswith('.zip'):\n",
" from zipfile import ZipFile\n",
" import chardet\n",
" ZIP_FILENAME_UTF8_FLAG = 0x800\n",
" zf = ZipFile(rf , 'r')\n",
" for info in zf.filelist:\n",
" zfilename = info.filename\n",
" if info.flag_bits & ZIP_FILENAME_UTF8_FLAG == 0:\n",
" filename_bytes = zfilename.encode('437')\n",
" guessed_encoding = chardet.detect(filename_bytes)['encoding'] or 'cp1252'\n",
" zfilename = filename_bytes.decode(guessed_encoding, 'replace')\n",
" if self.in_archive_re_mask:\n",
" if not re.match(self.in_archive_re_mask, zfilename):\n",
" continue\n",
" yield str(Path(filename) / Path(zfilename)), zf.open(info.filename, 'r')\n",
"\n",
" if filename.endswith('.rar'):\n",
" import rarfile\n",
" import io\n",
" from sys import platform as _platform\n",
" \n",
" ex = Exception('платформа не поддерживается, скачайте утилиту unrar: https://www.rarlab.com/rar_add.htm')\n",
"\n",
" if _platform == \"linux\" or _platform == \"linux2\": # linux\n",
" raise ex\n",
"\n",
" elif _platform == \"win32\": # Windows\n",
" rarfile.UNRAR_TOOL = 'c:\\\\dev\\\\.jupyter\\\\jupyter_snippet\\\\UnRAR.exe'\n",
"\n",
" elif _platform == \"win64\": # Windows 64-bit\n",
" raise ex\n",
" elif _platform == 'darwin':\n",
" raise ex\n",
" else:\n",
" raise Exception('неизвестная платформа, поищите rar для неё и узнайте название (sys.platform)')\n",
"\n",
" rf = rarfile.RarFile(rf, 'r')\n",
" for info in rf.infolist():\n",
" if info.filename == 'README':\n",
" continue\n",
" \n",
" rfilename = info.filename\n",
"\n",
" if self.in_archive_re_mask:\n",
" if not re.match(self.in_archive_re_mask, rfilename):\n",
" continue\n",
" \n",
" yield str(Path(filename) / Path(rfilename)), io.BytesIO(rf.read(info.filename))\n",
"\n",
" @property\n",
" def _columns(self):\n",
" columns = []\n",
" if self.global_index:\n",
" columns.append('global_index')\n",
" if self.wb_index:\n",
" columns.append('ws_index')\n",
" if self.filename_parser:\n",
" columns += self.filename_parser(self.wb[0]).keys()\n",
" if self.append_ws_title_column:\n",
" columns.append('ws_title')\n",
" if self.cells:\n",
" columns += self.cells.keys()\n",
" if self.headers:\n",
" columns += self.headers\n",
" return columns\n",
"\n",
" def _process_xlsx(self, file_obj):\n",
" 'итерация по строкам из файла типа xls[x|m]'\n",
" import openpyxl\n",
"\n",
" wb = openpyxl.load_workbook(file_obj, read_only=True, data_only=False, keep_links=False)\n",
" if isinstance(self.iterate_over_worksheet, int):\n",
" iter_sheet = [self.iterate_over_worksheet]\n",
" elif isinstance(self.iterate_over_worksheet, list) or isinstance(self.iterate_over_worksheet, tuple):\n",
" iter_sheet = self.iterate_over_worksheet\n",
" else:\n",
" iter_sheet = [i for i, ws in enumerate(wb.worksheets) if not ws.title.startswith('hidden')]\n",
"\n",
" for ws_index in iter_sheet:\n",
" ws = wb.worksheets[ws_index]\n",
"\n",
" result = ((cell.value for cell in row) for row in ws.rows)\n",
" \n",
" if self.first_row_number:\n",
" result = islice(result, self.first_row_number - 1, None)\n",
"\n",
" if self.filter_empty_cells:\n",
" result = ((val for val in row) for row in result if any(val for val in row))\n",
"\n",
" if self.cells:\n",
" retry_cols_values = [ws[c].value for c in self.cells.values()]\n",
" result = ((*retry_cols_values, *row) for row in result)\n",
" if self.append_ws_title_column:\n",
" result = ((ws.title, *row) for row in result)\n",
"\n",
" yield from result\n",
" \n",
" def _process_xls(self, file_obj):\n",
" 'итерация по рабочим листам книг формата xls'\n",
" import xlrd\n",
" l = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'\n",
" A_Z = [a+b for a in chain([''], l) for b in l]\n",
" \n",
" def addr_to_offset(addr):\n",
" addr = re.match('([A-Z]+)(\\d+)', addr) # ('A', '14')\n",
" return [A_Z.index(addr[0]), int(addr[1]) - 1]\n",
" \n",
" wb = xlrd.open_workbook(file_contents=file_obj.read(), on_demand=True)\n",
"\n",
" if isinstance(self.iterate_over_worksheet, int):\n",
" iter_sheet = [wb.sheet_by_index(self.iterate_over_worksheet)]\n",
" elif isinstance(self.iterate_over_worksheet, list) or isinstance(self.iterate_over_worksheet, tuple):\n",
" iter_sheet = [wb.sheet_by_index(idx) for idx in self.iterate_over_worksheet]\n",
" else:\n",
" iter_sheet = [sheet for sheet in wb.sheets() if sheet.visibility==0]\n",
"\n",
" for ws in iter_sheet:\n",
" result = (ws.row_values(row_idx, start_colx=0, end_colx=None) for row_idx in range(ws.nrows))\n",
" if self.first_row_number:\n",
" result = islice(result, self.first_row_number - 1, None)\n",
"\n",
" if self.filter_empty_cells:\n",
" result = ((val for val in row) for row in result if any(val for val in row))\n",
"\n",
" if self.cells:\n",
" retry_cols_values = [ws.cell_value(*addr_to_offset(c)) for c in self.cells.values()]\n",
" result = ((*retry_cols_values, *row) for row in result)\n",
" if self.append_ws_title_column:\n",
" result = ((ws.name, *row) for row in result)\n",
"\n",
" yield from result\n",
"\n",
" def _iter_over_rows(self):\n",
" 'итерация по общей последовательности строк в итоговом наборе'\n",
" row_counter = 0\n",
" for fn, excel_file in self._process_files:\n",
" if fn.endswith('.xlsx') or fn.endswith('.xlsm'):\n",
" result = self._process_xlsx(excel_file)\n",
" else:\n",
" result = self._process_xls(excel_file)\n",
" \n",
" if self.filename_parser:\n",
" fncols = self.filename_parser(fn)\n",
" result = ((*fncols.values(), *row) for row in result)\n",
" if self.wb_index:\n",
" result = ((i, *row) for i, row in enumerate(result))\n",
" if self.global_index:\n",
" result = ((row_counter, *row) for i, row in enumerate(result))\n",
" for row in result:\n",
" yield row\n",
" row_counter += 1\n",
"\n",
" def get_dataframe(self):\n",
" 'возвращает набор данных в виде pandas DataFrame'\n",
" data = ([val for col, val in zip(self._columns, row)] for row in self._iter_over_rows())\n",
" return pd.DataFrame(data, columns=self._columns)\n",
"\n",
" def write_csv(self, csv_filename, write_header=True, dialect='excel', **fmtparams):\n",
" '''\n",
" записывает набор данных в файл csv, входные параметры такие же как у метода csv.writer\n",
" https://docs.python.org/3/library/csv.html\n",
" '''\n",
" import csv\n",
" with open(csv_filename, 'w', encoding='utf8') as wf:\n",
" writer = csv.writer(wf, dialect, **fmtparams)\n",
" writer.writerow(self._columns)\n",
" writer.writerows(self._iter_over_rows())\n",
"\n",
" def write_excel(self, excel_filename):\n",
" 'записывает набор данных в файл excel'\n",
" from openpyxl import Workbook\n",
" wb = Workbook(write_only=True)\n",
" ws = wb.create_sheet()\n",
" ws.append(self._columns)\n",
" for row in self._iter_over_rows():\n",
" ws.append(row)\n",
" wb.save(excel_filename)\n",
"\n",
" def get_iter_dict(self):\n",
" 'возвращает набор данных в виде итератора словарей'\n",
" cols = self._columns\n",
" for row in self._iter_over_rows():\n",
" yield {k:v for k, v in zip(cols, row)}\n",
"\n",
" def get_iter_nt(self):\n",
" 'возвращает набор данных в виде итерируемого набора NamedTyple'\n",
" from collections import namedtuple\n",
" cols = self._columns\n",
" nt = namedtuple(\"Row\", [re.sub(r\"[\\. \\(\\)]\", \"_\", c).lower() for c in cols])\n",
" for row in self._iter_over_rows():\n",
" yield nt(*row)\n",
"\n",
" def get_sample(self, start, stop):\n",
" 'возвращает сэмпл итогового набора (можно проверить правильность обработки данных)'\n",
" return [list(row) for row in islice(self._iter_over_rows(), start, stop)]\n",
" \n",
" def get_sample_dataframe(self, start, stop) -> pd.DataFrame:\n",
" 'возвращает сэмпл итогового набора в виде DataFrame'\n",
" data = ([val for col, val in zip(self._columns, row)] for row in islice(self._iter_over_rows(), start, stop))\n",
" return pd.DataFrame(data, columns=self._columns)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ExcelPreprocessor['data\\\\2009\\\\4nm010618.xlsx', 'data\\\\2008\\\\4nm011118.xlsx']:\n",
" add global index: False\n",
" add workbook index: False\n",
" filename parser: None\n",
" append worksheet title: False\n",
" first row number in worksheet: 1\n",
" worksheets in all wb to iterate: 0\n",
" cells: None\n",
" headers: None"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# создаем экземпляр класса и смотрим что получается\n",
"processor = ExcelPreprocessor('data\\\\2009\\\\4nm010618.xlsx', 'data\\\\2008\\\\4nm011118.xlsx')\n",
"processor"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[None, 'Код стро-ки', 'Всего ', 'из графы 1 по :', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]\n",
"[None, None, None, 'федеральным налогам и сборам', None, None, None, None, None, None, None, 'региональным налогам и сборам', 'местным налогам и сборам', 'налогам со специальным налоговым режимом', 'Единому социальному налогу', 'Из графы 13 единый социальный налог в федеральный бюджет', 'Платежам в государственные внебюджетные фонды', 'ВСЕГО задолженность по страховым взносам', None, None, None, None, None, None]\n",
"[None, None, None, 'Всего ', 'из них', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]\n",
"[None, None, None, None, 'налог на прибыль организаций', None, 'налог на добавленную стоимость ', 'из графы 5 налог на добавленную стоимость по товарам (работам, услугам), реализуемым на территории РФ', 'платежи за пользование природными ресурсами ', 'из графы 7\\n налог на добычу полезных ископаемых', 'остальные федеральные налоги и сборы', None, None, None, None, None, None, None, None, None, None, None, None, None]\n",
"[None, None, None, None, 'Всего ', 'в том числе в федеральный бюджет', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]\n",
"['А', 'Б', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, None, None, None, None, None, None]\n"
]
}
],
"source": [
"# попробуем получить кусочек данных например с 5 по 8 строку и посмотреть что получается\n",
"for row in processor.get_sample(5, 11):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Уточненные данные по состоянию на 16.07.2018\\n', 'ЗАДОЛЖЕННОСТЬ - ВСЕГО (стр.1010+стр.1045+стр.1093)', 1005, 1235462116, 613079956, 137068882, 16509931, 378521695, 377407864, 7475610, 7001696, 90013769, 152239095, 76851905, 52246651, 4059156, 2911250, 892339, 336093014, None, None, None, None, None, None]\n",
"['Уточненные данные по состоянию на 16.07.2018\\n', 'ЗАДОЛЖЕННОСТЬ ПЕРЕД БЮДЖЕТОМ ПО НАЛОГАМ, СБОРАМ, ПЕНЯМ, НАЛОГОВЫМ САНКЦИЯМ ВСЕГО В ТОМ ЧИСЛЕ:', 1010, 966783423, 470685208, 102647180, 12435295, 293731518, 292643494, 6239798, 6015450, 68066712, 122048711, 66194027, 50606745, 2990596, 2027362, 838342, 253419794, None, None, None, None, None, None]\n",
"['Уточненные данные по состоянию на 16.07.2018\\n', 'из нее\\nзадолженность, невозможная к взысканию в связи с отложением подачи заявления в арбитражный суд о признании должника банкротом', 1011, 59789575, 45827009, 6993023, 772092, 33088224, 32865309, 70488, 41193, 5675274, 369196, 357281, 3524787, 92613, 75241, 41612, 9577077, None, None, None, None, None, None]\n",
"['Уточненные данные по состоянию на 16.07.2018\\n', 'НЕДОИМКА', 1020, 738863445, 350780985, 76405303, 9214666, 221615329, 220703752, 5920609, 5756088, 46839744, 98789604, 54482333, 37990418, 896167, 605705, 190163, 195733775, None, None, None, None, None, None]\n"
]
}
],
"source": [
"# Видим что данные начинаются только с 12 строки, к тому же нам надо добавить\n",
"# в каждую строку информацию из ячейки \"A1\", назовём её \"date_ot\"\n",
"processor.cells = {'date_ot': 'A1'}\n",
"processor.first_row_number = 12\n",
"for row in processor.get_sample(0, 4):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['2009', '4nm010618', 'xlsx', 'Уточненные данные по состоянию на 16.07.2018\\n', 'ЗАДОЛЖЕННОСТЬ - ВСЕГО (стр.1010+стр.1045+стр.1093)', 1005, 1235462116, 613079956, 137068882, 16509931, 378521695, 377407864, 7475610, 7001696, 90013769, 152239095, 76851905, 52246651, 4059156, 2911250, 892339, 336093014, None, None, None, None, None, None]\n",
"['2009', '4nm010618', 'xlsx', 'Уточненные данные по состоянию на 16.07.2018\\n', 'ЗАДОЛЖЕННОСТЬ ПЕРЕД БЮДЖЕТОМ ПО НАЛОГАМ, СБОРАМ, ПЕНЯМ, НАЛОГОВЫМ САНКЦИЯМ ВСЕГО В ТОМ ЧИСЛЕ:', 1010, 966783423, 470685208, 102647180, 12435295, 293731518, 292643494, 6239798, 6015450, 68066712, 122048711, 66194027, 50606745, 2990596, 2027362, 838342, 253419794, None, None, None, None, None, None]\n",
"['2009', '4nm010618', 'xlsx', 'Уточненные данные по состоянию на 16.07.2018\\n', 'из нее\\nзадолженность, невозможная к взысканию в связи с отложением подачи заявления в арбитражный суд о признании должника банкротом', 1011, 59789575, 45827009, 6993023, 772092, 33088224, 32865309, 70488, 41193, 5675274, 369196, 357281, 3524787, 92613, 75241, 41612, 9577077, None, None, None, None, None, None]\n",
"['2009', '4nm010618', 'xlsx', 'Уточненные данные по состоянию на 16.07.2018\\n', 'НЕДОИМКА', 1020, 738863445, 350780985, 76405303, 9214666, 221615329, 220703752, 5920609, 5756088, 46839744, 98789604, 54482333, 37990418, 896167, 605705, 190163, 195733775, None, None, None, None, None, None]\n"
]
}
],
"source": [
"# добавим функцию для того чтобы извлечь из имён файлов полезную информацию и потом добавить её в наш набор данных\n",
"# один файл выглядит так: 'data\\\\2009\\\\4nm010618.xlsx'\n",
"# сделаем функцию, которая принимает название файла в таком формате и регулярным выражением возвращает год, название файла и его расширение\n",
"\n",
"def parse_filename(filename):\n",
" rstring = r'data\\\\(?P<year>\\w+)\\\\(?P<filename>\\w+)\\.(?P<extension>\\w+)'\n",
" r = re.match(rstring, filename)\n",
" return r.groupdict()\n",
"\n",
"processor.filename_parser = parse_filename\n",
"for row in processor.get_sample(0, 4):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 0, '2009', '4nm010618', 'xlsx', 'P1', 'Уточненные данные по состоянию на 16.07.2018\\n', 'ЗАДОЛЖЕННОСТЬ - ВСЕГО (стр.1010+стр.1045+стр.1093)', 1005, 1235462116, 613079956, 137068882, 16509931, 378521695, 377407864, 7475610, 7001696, 90013769, 152239095, 76851905, 52246651, 4059156, 2911250, 892339, 336093014, None, None, None, None, None, None]\n",
"[1, 1, '2009', '4nm010618', 'xlsx', 'P1', 'Уточненные данные по состоянию на 16.07.2018\\n', 'ЗАДОЛЖЕННОСТЬ ПЕРЕД БЮДЖЕТОМ ПО НАЛОГАМ, СБОРАМ, ПЕНЯМ, НАЛОГОВЫМ САНКЦИЯМ ВСЕГО В ТОМ ЧИСЛЕ:', 1010, 966783423, 470685208, 102647180, 12435295, 293731518, 292643494, 6239798, 6015450, 68066712, 122048711, 66194027, 50606745, 2990596, 2027362, 838342, 253419794, None, None, None, None, None, None]\n",
"[2, 2, '2009', '4nm010618', 'xlsx', 'P1', 'Уточненные данные по состоянию на 16.07.2018\\n', 'из нее\\nзадолженность, невозможная к взысканию в связи с отложением подачи заявления в арбитражный суд о признании должника банкротом', 1011, 59789575, 45827009, 6993023, 772092, 33088224, 32865309, 70488, 41193, 5675274, 369196, 357281, 3524787, 92613, 75241, 41612, 9577077, None, None, None, None, None, None]\n",
"[3, 3, '2009', '4nm010618', 'xlsx', 'P1', 'Уточненные данные по состоянию на 16.07.2018\\n', 'НЕДОИМКА', 1020, 738863445, 350780985, 76405303, 9214666, 221615329, 220703752, 5920609, 5756088, 46839744, 98789604, 54482333, 37990418, 896167, 605705, 190163, 195733775, None, None, None, None, None, None]\n"
]
}
],
"source": [
"# теперь добавим в наш результируюший набор поле сквозного индекса\n",
"# и отдельный индекс по каждому рабочему листу, что нам попадётся\n",
"processor.global_index = True\n",
"processor.wb_index = True\n",
"# также нелишним будет добавить и колонку с заголовком рабочего листа с которого получены данные\n",
"processor.append_ws_title_column = True\n",
"for row in processor.get_sample(0, 4):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'global_index': 0, 'ws_index': 0, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n'}\n",
"{'global_index': 1, 'ws_index': 1, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n'}\n",
"{'global_index': 2, 'ws_index': 2, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n'}\n",
"{'global_index': 3, 'ws_index': 3, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n'}\n",
"{'global_index': 4, 'ws_index': 4, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n'}\n"
]
}
],
"source": [
"# посмотрим то же самое в формате набора словарей:\n",
"for row in islice(processor.get_iter_dict(), 0, 5):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'global_index': 0, 'ws_index': 0, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n', 'поле': 'ЗАДОЛЖЕННОСТЬ - ВСЕГО (стр.1010+стр.1045+стр.1093)', 'код строки': 1005, 'кол. налогоплательщиков': 1235462116, 'всего': 613079956, 'по налогу (сбору)': 137068882, 'по пени': 16509931, 'по штрафам': 378521695, 'проценты': 377407864, 'по ЕСН': 7475610, 'по страховым взносам': 7001696}\n",
"{'global_index': 1, 'ws_index': 1, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n', 'поле': 'ЗАДОЛЖЕННОСТЬ ПЕРЕД БЮДЖЕТОМ ПО НАЛОГАМ, СБОРАМ, ПЕНЯМ, НАЛОГОВЫМ САНКЦИЯМ ВСЕГО В ТОМ ЧИСЛЕ:', 'код строки': 1010, 'кол. налогоплательщиков': 966783423, 'всего': 470685208, 'по налогу (сбору)': 102647180, 'по пени': 12435295, 'по штрафам': 293731518, 'проценты': 292643494, 'по ЕСН': 6239798, 'по страховым взносам': 6015450}\n",
"{'global_index': 2, 'ws_index': 2, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n', 'поле': 'из нее\\nзадолженность, невозможная к взысканию в связи с отложением подачи заявления в арбитражный суд о признании должника банкротом', 'код строки': 1011, 'кол. налогоплательщиков': 59789575, 'всего': 45827009, 'по налогу (сбору)': 6993023, 'по пени': 772092, 'по штрафам': 33088224, 'проценты': 32865309, 'по ЕСН': 70488, 'по страховым взносам': 41193}\n",
"{'global_index': 3, 'ws_index': 3, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n', 'поле': 'НЕДОИМКА', 'код строки': 1020, 'кол. налогоплательщиков': 738863445, 'всего': 350780985, 'по налогу (сбору)': 76405303, 'по пени': 9214666, 'по штрафам': 221615329, 'проценты': 220703752, 'по ЕСН': 5920609, 'по страховым взносам': 5756088}\n",
"{'global_index': 4, 'ws_index': 4, 'year': '2009', 'filename': '4nm010618', 'extension': 'xlsx', 'ws_title': 'P1', 'date_ot': 'Уточненные данные по состоянию на 16.07.2018\\n', 'поле': '\\nИз строки 1020\\nНЕДОИМКА ОРГАНИЗАЦИЙ И ИНДИВИДУАЛЬНЫХ ПРЕДПРИНИМАТЕЛЕЙ, НЕ ПРЕДСТАВЛЯЮЩИХ ОТЧЕТНОСТЬ', 'код строки': 1030, 'кол. налогоплательщиков': 105337216, 'всего': 68231581, 'по налогу (сбору)': 16344932, 'по пени': 1759494, 'по штрафам': 47161217, 'проценты': 47056987, 'по ЕСН': 36418, 'по страховым взносам': 27229}\n"
]
}
],
"source": [
"# видно что не хватает колонок, т.к. не задан параметр \"headers\". исправим это\n",
"processor.headers = [\n",
" 'поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)',\n",
" 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'\n",
"]\n",
"for row in islice(processor.get_iter_dict(), 0, 5):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>global_index</th>\n",
" <th>ws_index</th>\n",
" <th>year</th>\n",
" <th>filename</th>\n",
" <th>extension</th>\n",
" <th>ws_title</th>\n",
" <th>date_ot</th>\n",
" <th>поле</th>\n",
" <th>код строки</th>\n",
" <th>кол. налогоплательщиков</th>\n",
" <th>всего</th>\n",
" <th>по налогу (сбору)</th>\n",
" <th>по пени</th>\n",
" <th>по штрафам</th>\n",
" <th>проценты</th>\n",
" <th>по ЕСН</th>\n",
" <th>по страховым взносам</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>P1</td>\n",
" <td>Уточненные данные по состоянию на 16.07.2018\\n</td>\n",
" <td>ЗАДОЛЖЕННОСТЬ - ВСЕГО (стр.1010+стр.1045+стр.1...</td>\n",
" <td>1005</td>\n",
" <td>1235462116</td>\n",
" <td>613079956</td>\n",
" <td>137068882</td>\n",
" <td>16509931</td>\n",
" <td>378521695</td>\n",
" <td>377407864</td>\n",
" <td>7475610</td>\n",
" <td>7001696</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>P1</td>\n",
" <td>Уточненные данные по состоянию на 16.07.2018\\n</td>\n",
" <td>ЗАДОЛЖЕННОСТЬ ПЕРЕД БЮДЖЕТОМ ПО НАЛОГАМ, СБОРА...</td>\n",
" <td>1010</td>\n",
" <td>966783423</td>\n",
" <td>470685208</td>\n",
" <td>102647180</td>\n",
" <td>12435295</td>\n",
" <td>293731518</td>\n",
" <td>292643494</td>\n",
" <td>6239798</td>\n",
" <td>6015450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>P1</td>\n",
" <td>Уточненные данные по состоянию на 16.07.2018\\n</td>\n",
" <td>из нее\\nзадолженность, невозможная к взысканию...</td>\n",
" <td>1011</td>\n",
" <td>59789575</td>\n",
" <td>45827009</td>\n",
" <td>6993023</td>\n",
" <td>772092</td>\n",
" <td>33088224</td>\n",
" <td>32865309</td>\n",
" <td>70488</td>\n",
" <td>41193</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>P1</td>\n",
" <td>Уточненные данные по состоянию на 16.07.2018\\n</td>\n",
" <td>НЕДОИМКА</td>\n",
" <td>1020</td>\n",
" <td>738863445</td>\n",
" <td>350780985</td>\n",
" <td>76405303</td>\n",
" <td>9214666</td>\n",
" <td>221615329</td>\n",
" <td>220703752</td>\n",
" <td>5920609</td>\n",
" <td>5756088</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>P1</td>\n",
" <td>Уточненные данные по состоянию на 16.07.2018\\n</td>\n",
" <td>\\nИз строки 1020\\nНЕДОИМКА ОРГАНИЗАЦИЙ И ИНДИВ...</td>\n",
" <td>1030</td>\n",
" <td>105337216</td>\n",
" <td>68231581</td>\n",
" <td>16344932</td>\n",
" <td>1759494</td>\n",
" <td>47161217</td>\n",
" <td>47056987</td>\n",
" <td>36418</td>\n",
" <td>27229</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" global_index ws_index year filename extension ws_title \\\n",
"0 0 0 2009 4nm010618 xlsx P1 \n",
"1 1 1 2009 4nm010618 xlsx P1 \n",
"2 2 2 2009 4nm010618 xlsx P1 \n",
"3 3 3 2009 4nm010618 xlsx P1 \n",
"4 4 4 2009 4nm010618 xlsx P1 \n",
"\n",
" date_ot \\\n",
"0 Уточненные данные по состоянию на 16.07.2018\\n \n",
"1 Уточненные данные по состоянию на 16.07.2018\\n \n",
"2 Уточненные данные по состоянию на 16.07.2018\\n \n",
"3 Уточненные данные по состоянию на 16.07.2018\\n \n",
"4 Уточненные данные по состоянию на 16.07.2018\\n \n",
"\n",
" поле код строки \\\n",
"0 ЗАДОЛЖЕННОСТЬ - ВСЕГО (стр.1010+стр.1045+стр.1... 1005 \n",
"1 ЗАДОЛЖЕННОСТЬ ПЕРЕД БЮДЖЕТОМ ПО НАЛОГАМ, СБОРА... 1010 \n",
"2 из нее\\nзадолженность, невозможная к взысканию... 1011 \n",
"3 НЕДОИМКА 1020 \n",
"4 \\nИз строки 1020\\nНЕДОИМКА ОРГАНИЗАЦИЙ И ИНДИВ... 1030 \n",
"\n",
" кол. налогоплательщиков всего по налогу (сбору) по пени \\\n",
"0 1235462116 613079956 137068882 16509931 \n",
"1 966783423 470685208 102647180 12435295 \n",
"2 59789575 45827009 6993023 772092 \n",
"3 738863445 350780985 76405303 9214666 \n",
"4 105337216 68231581 16344932 1759494 \n",
"\n",
" по штрафам проценты по ЕСН по страховым взносам \n",
"0 378521695 377407864 7475610 7001696 \n",
"1 293731518 292643494 6239798 6015450 \n",
"2 33088224 32865309 70488 41193 \n",
"3 221615329 220703752 5920609 5756088 \n",
"4 47161217 47056987 36418 27229 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# попробуем то же самое в виде DataFrame\n",
"processor.get_sample_dataframe(0, 5)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>global_index</th>\n",
" <th>ws_index</th>\n",
" <th>year</th>\n",
" <th>filename</th>\n",
" <th>extension</th>\n",
" <th>ws_title</th>\n",
" <th>form_name</th>\n",
" <th>description</th>\n",
" <th>form_dt</th>\n",
" <th>unit_name</th>\n",
" <th>поле</th>\n",
" <th>код строки</th>\n",
" <th>кол. налогоплательщиков</th>\n",
" <th>всего</th>\n",
" <th>по налогу (сбору)</th>\n",
" <th>по пени</th>\n",
" <th>по штрафам</th>\n",
" <th>проценты</th>\n",
" <th>по ЕСН</th>\n",
" <th>по страховым взносам</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>Р. Справочно1_Списание</td>\n",
" <td>Форма № 4-НМ</td>\n",
" <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
" <td>по состоянию на 01.06.2018 г.</td>\n",
" <td>тыс. рублей</td>\n",
" <td>Сумма списанной задолженности организаций, лик...</td>\n",
" <td>2400</td>\n",
" <td>23632</td>\n",
" <td>30705385</td>\n",
" <td>18388429</td>\n",
" <td>4699527</td>\n",
" <td>1439452</td>\n",
" <td>184237</td>\n",
" <td>596724</td>\n",
" <td>5397016</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>Р. Справочно1_Списание</td>\n",
" <td>Форма № 4-НМ</td>\n",
" <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
" <td>по состоянию на 01.06.2018 г.</td>\n",
" <td>тыс. рублей</td>\n",
" <td>Сумма списанной задолженности индивидуальных п...</td>\n",
" <td>2405</td>\n",
" <td>9109</td>\n",
" <td>3013574</td>\n",
" <td>1661920</td>\n",
" <td>881213</td>\n",
" <td>242978</td>\n",
" <td>628</td>\n",
" <td>51928</td>\n",
" <td>174907</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>Р. Справочно1_Списание</td>\n",
" <td>Форма № 4-НМ</td>\n",
" <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
" <td>по состоянию на 01.06.2018 г.</td>\n",
" <td>тыс. рублей</td>\n",
" <td>Сумма списанной задолженности умерших или объя...</td>\n",
" <td>2410</td>\n",
" <td>39920</td>\n",
" <td>677495</td>\n",
" <td>115346</td>\n",
" <td>44047</td>\n",
" <td>11984</td>\n",
" <td>0</td>\n",
" <td>5203</td>\n",
" <td>500915</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>Р. Справочно1_Списание</td>\n",
" <td>Форма № 4-НМ</td>\n",
" <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
" <td>по состоянию на 01.06.2018 г.</td>\n",
" <td>тыс. рублей</td>\n",
" <td>Сумма списанной задолженности в случаях принят...</td>\n",
" <td>2415</td>\n",
" <td>58635</td>\n",
" <td>2161183</td>\n",
" <td>773472</td>\n",
" <td>556271</td>\n",
" <td>70216</td>\n",
" <td>5769</td>\n",
" <td>154600</td>\n",
" <td>600855</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2009</td>\n",
" <td>4nm010618</td>\n",
" <td>xlsx</td>\n",
" <td>Р. Справочно1_Списание</td>\n",
" <td>Форма № 4-НМ</td>\n",
" <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
" <td>по состоянию на 01.06.2018 г.</td>\n",
" <td>тыс. рублей</td>\n",
" <td>Сумма списанной задолженности по \"зависшим\" пл...</td>\n",
" <td>2420</td>\n",
" <td>93</td>\n",
" <td>43115</td>\n",
" <td>31946</td>\n",
" <td>752</td>\n",
" <td>76</td>\n",
" <td>0</td>\n",
" <td>5527</td>\n",
" <td>4814</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" global_index ws_index year filename extension ws_title \\\n",
"0 0 0 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
"1 1 1 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
"2 2 2 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
"3 3 3 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
"4 4 4 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
"\n",
" form_name description \\\n",
"0 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
"1 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
"2 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
"3 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
"4 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
"\n",
" form_dt unit_name \\\n",
"0 по состоянию на 01.06.2018 г. тыс. рублей \n",
"1 по состоянию на 01.06.2018 г. тыс. рублей \n",
"2 по состоянию на 01.06.2018 г. тыс. рублей \n",
"3 по состоянию на 01.06.2018 г. тыс. рублей \n",
"4 по состоянию на 01.06.2018 г. тыс. рублей \n",
"\n",
" поле код строки \\\n",
"0 Сумма списанной задолженности организаций, лик... 2400 \n",
"1 Сумма списанной задолженности индивидуальных п... 2405 \n",
"2 Сумма списанной задолженности умерших или объя... 2410 \n",
"3 Сумма списанной задолженности в случаях принят... 2415 \n",
"4 Сумма списанной задолженности по \"зависшим\" пл... 2420 \n",
"\n",
" кол. налогоплательщиков всего по налогу (сбору) по пени по штрафам \\\n",
"0 23632 30705385 18388429 4699527 1439452 \n",
"1 9109 3013574 1661920 881213 242978 \n",
"2 39920 677495 115346 44047 11984 \n",
"3 58635 2161183 773472 556271 70216 \n",
"4 93 43115 31946 752 76 \n",
"\n",
" проценты по ЕСН по страховым взносам \n",
"0 184237 596724 5397016 \n",
"1 628 51928 174907 \n",
"2 0 5203 500915 \n",
"3 5769 154600 600855 \n",
"4 0 5527 4814 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# пример как полностью настроить класс сразу при создании\n",
"proc = ExcelPreprocessor('data\\\\2009\\\\4nm010618.xlsx', 'data\\\\2008\\\\4nm011118.xlsx',\n",
" filename_parser=lambda filename: re.match(r'data\\\\(?P<year>\\w+)\\\\(?P<filename>\\w+)\\.(?P<extension>\\w+)', filename).groupdict(),\n",
" append_global_index=True, append_wb_index=True,\n",
" cells = {'form_name': 'J1', 'description': 'A2', 'form_dt': 'A3', 'unit_name': 'J4'},\n",
" headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
" first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=2\n",
")\n",
"proc.get_dataframe()[:5]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>global_index</th>\n",
" <th>ws_index</th>\n",
" <th>filename</th>\n",
" <th>ws_title</th>\n",
" <th>поле</th>\n",
" <th>код строки</th>\n",
" <th>кол. налогоплательщиков</th>\n",
" <th>всего</th>\n",
" <th>по налогу (сбору)</th>\n",
" <th>по пени</th>\n",
" <th>по штрафам</th>\n",
" <th>проценты</th>\n",
" <th>по ЕСН</th>\n",
" <th>по страховым взносам</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>федеральным налогам и сборам</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>Всего</td>\n",
" <td>из них</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>налог на прибыль организаций</td>\n",
" <td>None</td>\n",
" <td>налог на добавленную стоимость</td>\n",
" <td>из графы 5 налог на добавленную стоимость по т...</td>\n",
" <td>платежи за пользование природными ресурсами</td>\n",
" <td>из графы 7\\n налог на добычу полезных ископаемых</td>\n",
" <td>остальные федеральные налоги и сборы</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>Всего</td>\n",
" <td>в том числе в федеральный бюджет</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>А</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>195</td>\n",
" <td>195</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>196</td>\n",
" <td>196</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>197</td>\n",
" <td>197</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>198</td>\n",
" <td>198</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>199</td>\n",
" <td>199</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" global_index ws_index \\\n",
"0 0 0 \n",
"1 1 1 \n",
"2 2 2 \n",
"3 3 3 \n",
"4 4 4 \n",
".. ... ... \n",
"195 195 195 \n",
"196 196 196 \n",
"197 197 197 \n",
"198 198 198 \n",
"199 199 199 \n",
"\n",
" filename ws_title поле \\\n",
"0 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"1 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"2 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"3 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"4 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 А \n",
".. ... ... ... \n",
"195 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"196 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"197 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"198 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"199 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
"\n",
" код строки кол. налогоплательщиков всего \\\n",
"0 NaN федеральным налогам и сборам None \n",
"1 NaN Всего из них \n",
"2 NaN None налог на прибыль организаций \n",
"3 NaN None Всего \n",
"4 1.0 2 3 \n",
".. ... ... ... \n",
"195 NaN None None \n",
"196 NaN None None \n",
"197 NaN None None \n",
"198 NaN None None \n",
"199 NaN None None \n",
"\n",
" по налогу (сбору) по пени \\\n",
"0 None None \n",
"1 None None \n",
"2 None налог на добавленную стоимость \n",
"3 в том числе в федеральный бюджет None \n",
"4 4 5 \n",
".. ... ... \n",
"195 None None \n",
"196 None None \n",
"197 None None \n",
"198 None None \n",
"199 None None \n",
"\n",
" по штрафам \\\n",
"0 None \n",
"1 None \n",
"2 из графы 5 налог на добавленную стоимость по т... \n",
"3 None \n",
"4 6 \n",
".. ... \n",
"195 None \n",
"196 None \n",
"197 None \n",
"198 None \n",
"199 None \n",
"\n",
" проценты \\\n",
"0 None \n",
"1 None \n",
"2 платежи за пользование природными ресурсами \n",
"3 None \n",
"4 7 \n",
".. ... \n",
"195 None \n",
"196 None \n",
"197 None \n",
"198 None \n",
"199 None \n",
"\n",
" по ЕСН \\\n",
"0 None \n",
"1 None \n",
"2 из графы 7\\n налог на добычу полезных ископаемых \n",
"3 None \n",
"4 8 \n",
".. ... \n",
"195 None \n",
"196 None \n",
"197 None \n",
"198 None \n",
"199 None \n",
"\n",
" по страховым взносам \n",
"0 None \n",
"1 None \n",
"2 остальные федеральные налоги и сборы \n",
"3 None \n",
"4 9 \n",
".. ... \n",
"195 None \n",
"196 None \n",
"197 None \n",
"198 None \n",
"199 None \n",
"\n",
"[200 rows x 14 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# пробуем извлечь данные из архива, и только один файл\n",
"proc = ExcelPreprocessor('https://www.nalog.ru/html/sites/www.new.nalog.ru/docs/otchet/4nm011018reg.zip',\n",
" filename_parser=lambda filename: {'filename': filename},\n",
" append_global_index=True, append_wb_index=True,\n",
" headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
" first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=0, in_archive_re_mask=r'.*Раздел( )?1.*\\.xlsx'\n",
")\n",
"proc.get_sample_dataframe(0, 200)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>global_index</th>\n",
" <th>ws_index</th>\n",
" <th>filename</th>\n",
" <th>ws_title</th>\n",
" <th>поле</th>\n",
" <th>код строки</th>\n",
" <th>кол. налогоплательщиков</th>\n",
" <th>всего</th>\n",
" <th>по налогу (сбору)</th>\n",
" <th>по пени</th>\n",
" <th>по штрафам</th>\n",
" <th>проценты</th>\n",
" <th>по ЕСН</th>\n",
" <th>по страховым взносам</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>региональным налогам и сборам</td>\n",
" <td>местным налогам и сборам</td>\n",
" <td>налогам со специальным налоговым режимом</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>из них</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>налог на добавленную стоимость</td>\n",
" <td>из графы 5 налог на добавленную стоимость по т...</td>\n",
" <td>платежи за пользование природными ресурсами</td>\n",
" <td>из графы 7\\n налог на добычу полезных ископаемых</td>\n",
" <td>остальные федеральные налоги и сборы</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>в том числе в федеральный бюджет</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7.0</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>95</td>\n",
" <td>95</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>3060920</td>\n",
" <td>1273911</td>\n",
" <td>76849</td>\n",
" <td>9384</td>\n",
" <td>989177</td>\n",
" <td>989065</td>\n",
" <td>16638.0</td>\n",
" <td>877</td>\n",
" <td>191247</td>\n",
" <td>525892</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>96</td>\n",
" <td>96</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>1118695</td>\n",
" <td>531438</td>\n",
" <td>121488</td>\n",
" <td>13819</td>\n",
" <td>249470</td>\n",
" <td>249469</td>\n",
" <td>31627.0</td>\n",
" <td>29256</td>\n",
" <td>128853</td>\n",
" <td>93277</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>97</td>\n",
" <td>97</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>4404357</td>\n",
" <td>2091467</td>\n",
" <td>458202</td>\n",
" <td>72443</td>\n",
" <td>1110375</td>\n",
" <td>1110306</td>\n",
" <td>20647.0</td>\n",
" <td>4214</td>\n",
" <td>502243</td>\n",
" <td>606823</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>98</td>\n",
" <td>98</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>540981</td>\n",
" <td>120118</td>\n",
" <td>14430</td>\n",
" <td>1406</td>\n",
" <td>58193</td>\n",
" <td>58037</td>\n",
" <td>2554.0</td>\n",
" <td>59</td>\n",
" <td>44941</td>\n",
" <td>81699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>99</td>\n",
" <td>99</td>\n",
" <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
" <td>1005</td>\n",
" <td>61052</td>\n",
" <td>44624</td>\n",
" <td>10852</td>\n",
" <td>1076</td>\n",
" <td>25804</td>\n",
" <td>25804</td>\n",
" <td>1336.0</td>\n",
" <td>857</td>\n",
" <td>6632</td>\n",
" <td>4049</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" global_index ws_index filename \\\n",
"0 0 0 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"1 1 1 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"2 2 2 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"3 3 3 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"4 4 4 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
".. ... ... ... \n",
"95 95 95 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"96 96 96 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"97 97 97 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"98 98 98 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"99 99 99 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"\n",
" ws_title поле \\\n",
"0 1005 None \n",
"1 1005 из них \n",
"2 1005 None \n",
"3 1005 в том числе в федеральный бюджет \n",
"4 1005 1 \n",
".. ... ... \n",
"95 1005 3060920 \n",
"96 1005 1118695 \n",
"97 1005 4404357 \n",
"98 1005 540981 \n",
"99 1005 61052 \n",
"\n",
" код строки \\\n",
"0 None \n",
"1 None \n",
"2 налог на добавленную стоимость \n",
"3 None \n",
"4 2 \n",
".. ... \n",
"95 1273911 \n",
"96 531438 \n",
"97 2091467 \n",
"98 120118 \n",
"99 44624 \n",
"\n",
" кол. налогоплательщиков \\\n",
"0 None \n",
"1 None \n",
"2 из графы 5 налог на добавленную стоимость по т... \n",
"3 None \n",
"4 3 \n",
".. ... \n",
"95 76849 \n",
"96 121488 \n",
"97 458202 \n",
"98 14430 \n",
"99 10852 \n",
"\n",
" всего \\\n",
"0 None \n",
"1 None \n",
"2 платежи за пользование природными ресурсами \n",
"3 None \n",
"4 4 \n",
".. ... \n",
"95 9384 \n",
"96 13819 \n",
"97 72443 \n",
"98 1406 \n",
"99 1076 \n",
"\n",
" по налогу (сбору) \\\n",
"0 None \n",
"1 None \n",
"2 из графы 7\\n налог на добычу полезных ископаемых \n",
"3 None \n",
"4 5 \n",
".. ... \n",
"95 989177 \n",
"96 249470 \n",
"97 1110375 \n",
"98 58193 \n",
"99 25804 \n",
"\n",
" по пени по штрафам \\\n",
"0 None NaN \n",
"1 None NaN \n",
"2 остальные федеральные налоги и сборы NaN \n",
"3 None NaN \n",
"4 6 7.0 \n",
".. ... ... \n",
"95 989065 16638.0 \n",
"96 249469 31627.0 \n",
"97 1110306 20647.0 \n",
"98 58037 2554.0 \n",
"99 25804 1336.0 \n",
"\n",
" проценты по ЕСН \\\n",
"0 региональным налогам и сборам местным налогам и сборам \n",
"1 None None \n",
"2 None None \n",
"3 None None \n",
"4 8 9 \n",
".. ... ... \n",
"95 877 191247 \n",
"96 29256 128853 \n",
"97 4214 502243 \n",
"98 59 44941 \n",
"99 857 6632 \n",
"\n",
" по страховым взносам \n",
"0 налогам со специальным налоговым режимом \n",
"1 None \n",
"2 None \n",
"3 None \n",
"4 10 \n",
".. ... \n",
"95 525892 \n",
"96 93277 \n",
"97 606823 \n",
"98 81699 \n",
"99 4049 \n",
"\n",
"[100 rows x 14 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# видим много пустых строк (без значений). Отфильтруем только те где есть хотя бы одно значение\n",
"proc.filter_empty_cells = True\n",
"proc.get_sample_dataframe(0, 200)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}