git / code.ach.gov.ru / gavrin / jupyter_snippet
commit 70fe216e22369347f436f9a443821b0783621b04
author Дмитрий Сергеевич Гаврин <gavrin_ds@ach.gov.ru>
date 2020-04-02 14:54:40 +0300
parents 70536840
message
добавлена фильтрация пустых ячеек
files
| file | add | del |
|---|---|---|
| ExcelPreprocessor.ipynb | +739 | -350 |
patch
diff --git a/ExcelPreprocessor.ipynb b/ExcelPreprocessor.ipynb
index fa3fe7e564758f134ab02415dc2617b951626194..02ca3a3f9bf1aaa43004ab62c24fc408d7b053ce 100644
--- a/ExcelPreprocessor.ipynb
+++ b/ExcelPreprocessor.ipynb
@@ -47,9 +47,9 @@ "Requirement already satisfied: openpyxl in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.0.3)\n",
"Requirement already satisfied: xlrd in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (1.2.0)\n",
"Requirement already satisfied: chardet in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.0.4)\n",
"Requirement already satisfied: rarfile in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.1)\n",
+ "Requirement already satisfied: numpy>=1.13.3 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (1.18.1)\n",
"Requirement already satisfied: pytz>=2017.2 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (2019.3)\n",
"Requirement already satisfied: python-dateutil>=2.6.1 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (2.8.1)\n",
- "Requirement already satisfied: numpy>=1.13.3 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (1.18.1)\n",
"Requirement already satisfied: jdcal in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from openpyxl) (1.4.1)\n",
"Requirement already satisfied: et_xmlfile in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from openpyxl) (1.0.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from python-dateutil>=2.6.1->pandas) (1.14.0)\n"
@@ -96,7 +96,7 @@ "source": [
"class ExcelPreprocessor:\n",
" def __init__(self, *wb, append_global_index=False, append_wb_index=False, \n",
" filename_parser=None, append_ws_title_column=False, first_row_number=1,\n",
- " iterate_over_worksheet=0, cells=None, headers=None, in_archive_re_mask=None):\n",
+ " iterate_over_worksheet=0, cells=None, headers=None, in_archive_re_mask=None, filter_empty_cells=False):\n",
" '''\n",
" параметр iterate_over_worksheet отвечает за то, какой именно рабочий лист в каждой прочитанной\n",
" книге будет разобран по умолчанию это лист с индексом 0, т.е. 1 в списке.\n",
@@ -122,6 +122,7 @@ " self.iterate_over_worksheet = iterate_over_worksheet\n",
" self.cells = cells\n",
" self.headers = headers\n",
" self.in_archive_re_mask = in_archive_re_mask\n",
+ " self.filter_empty_cells = filter_empty_cells\n",
"\n",
" def __repr__(self):\n",
" return f'''{ExcelPreprocessor.__name__}{self.wb}:\n",
@@ -156,7 +157,6 @@ " from zipfile import ZipFile\n",
" import chardet\n",
" ZIP_FILENAME_UTF8_FLAG = 0x800\n",
" zf = ZipFile(rf , 'r')\n",
- " print(filename)\n",
" for info in zf.filelist:\n",
" zfilename = info.filename\n",
" if not zfilename.endswith('.xlsx'):\n",
@@ -170,14 +170,6 @@ " if not re.match(self.in_archive_re_mask, zfilename):\n",
" continue\n",
" yield str(Path(filename) / Path(zfilename)), zf.open(info.filename, 'r')\n",
"\n",
- " if filename.endswith('.rar'):\n",
- " from rarfile import RarFile\n",
- " rf = rarfile.RarFile(rar_fn, 'r')\n",
- " for f in rf.infolist():\n",
- " print(f.filename, f.file_size)\n",
- " if f.filename == 'README':\n",
- " print(rf.read(f))\n",
- "\n",
" @property\n",
" def _columns(self):\n",
" columns = []\n",
@@ -212,7 +204,10 @@ " iter_sheet = [i for i, ws in enumerate(wb.worksheets) if not ws.title.startswith('hidden')]\n",
"\n",
" for ws_index in iter_sheet:\n",
" ws = wb.worksheets[ws_index]\n",
- " result = ((cell.value for cell in row) for row in ws.rows)\n",
+ " if self.filter_empty_cells:\n",
+ " result = ((cell.value for cell in row) for row in ws.rows if any(cell.value for cell in row))\n",
+ " else:\n",
+ " result = ((cell.value for cell in row) for row in ws.rows)\n",
"\n",
" if self.first_row_number:\n",
" result = islice(result, self.first_row_number - 1, None)\n",
@@ -471,7 +466,7 @@ ]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -656,7 +651,7 @@ "3 221615329 220703752 5920609 5756088 \n",
"4 47161217 47056987 36418 27229 "
]
},
- "execution_count": 13,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -668,7 +663,7 @@ ]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -830,371 +825,771 @@ " <td>0</td>\n",
" <td>5527</td>\n",
" <td>4814</td>\n",
" </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " global_index ws_index year filename extension ws_title \\\n",
+ "0 0 0 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
+ "1 1 1 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
+ "2 2 2 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
+ "3 3 3 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
+ "4 4 4 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
+ "\n",
+ " form_name description \\\n",
+ "0 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
+ "1 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
+ "2 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
+ "3 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
+ "4 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
+ "\n",
+ " form_dt unit_name \\\n",
+ "0 по состоянию на 01.06.2018 г. тыс. рублей \n",
+ "1 по состоянию на 01.06.2018 г. тыс. рублей \n",
+ "2 по состоянию на 01.06.2018 г. тыс. рублей \n",
+ "3 по состоянию на 01.06.2018 г. тыс. рублей \n",
+ "4 по состоянию на 01.06.2018 г. тыс. рублей \n",
+ "\n",
+ " поле код строки \\\n",
+ "0 Сумма списанной задолженности организаций, лик... 2400 \n",
+ "1 Сумма списанной задолженности индивидуальных п... 2405 \n",
+ "2 Сумма списанной задолженности умерших или объя... 2410 \n",
+ "3 Сумма списанной задолженности в случаях принят... 2415 \n",
+ "4 Сумма списанной задолженности по \"зависшим\" пл... 2420 \n",
+ "\n",
+ " кол. налогоплательщиков всего по налогу (сбору) по пени по штрафам \\\n",
+ "0 23632 30705385 18388429 4699527 1439452 \n",
+ "1 9109 3013574 1661920 881213 242978 \n",
+ "2 39920 677495 115346 44047 11984 \n",
+ "3 58635 2161183 773472 556271 70216 \n",
+ "4 93 43115 31946 752 76 \n",
+ "\n",
+ " проценты по ЕСН по страховым взносам \n",
+ "0 184237 596724 5397016 \n",
+ "1 628 51928 174907 \n",
+ "2 0 5203 500915 \n",
+ "3 5769 154600 600855 \n",
+ "4 0 5527 4814 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# пример как полностью настроить класс сразу при создании\n",
+ "proc = ExcelPreprocessor('data\\\\2009\\\\4nm010618.xlsx', 'data\\\\2008\\\\4nm011118.xlsx',\n",
+ " filename_parser=lambda filename: re.match(r'data\\\\(?P<year>\\w+)\\\\(?P<filename>\\w+)\\.(?P<extension>\\w+)', filename).groupdict(),\n",
+ " append_global_index=True, append_wb_index=True,\n",
+ " cells = {'form_name': 'J1', 'description': 'A2', 'form_dt': 'A3', 'unit_name': 'J4'},\n",
+ " headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
+ " first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=2\n",
+ ")\n",
+ "proc.get_dataframe()[:5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>global_index</th>\n",
+ " <th>ws_index</th>\n",
+ " <th>filename</th>\n",
+ " <th>ws_title</th>\n",
+ " <th>поле</th>\n",
+ " <th>код строки</th>\n",
+ " <th>кол. налогоплательщиков</th>\n",
+ " <th>всего</th>\n",
+ " <th>по налогу (сбору)</th>\n",
+ " <th>по пени</th>\n",
+ " <th>по штрафам</th>\n",
+ " <th>проценты</th>\n",
+ " <th>по ЕСН</th>\n",
+ " <th>по страховым взносам</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
" <tr>\n",
- " <th>5</th>\n",
- " <td>5</td>\n",
- " <td>5</td>\n",
- " <td>2009</td>\n",
- " <td>4nm010618</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.06.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности организаций, отв...</td>\n",
- " <td>2425</td>\n",
- " <td>25104</td>\n",
- " <td>4038755</td>\n",
- " <td>2487632</td>\n",
- " <td>1003525</td>\n",
- " <td>215253</td>\n",
- " <td>х</td>\n",
- " <td>29516</td>\n",
- " <td>302829</td>\n",
+ " <th>0</th>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>федеральным налогам и сборам</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>6</th>\n",
- " <td>6</td>\n",
- " <td>6</td>\n",
- " <td>2009</td>\n",
- " <td>4nm010618</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.06.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности по решениям нало...</td>\n",
- " <td>2430</td>\n",
- " <td>16841363</td>\n",
- " <td>60557919</td>\n",
- " <td>41458668</td>\n",
- " <td>19050086</td>\n",
- " <td>325</td>\n",
+ " <th>1</th>\n",
" <td>1</td>\n",
- " <td>0</td>\n",
- " <td>48839</td>\n",
+ " <td>1</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>Всего</td>\n",
+ " <td>из них</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>7</th>\n",
+ " <th>2</th>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>налог на прибыль организаций</td>\n",
+ " <td>None</td>\n",
+ " <td>налог на добавленную стоимость</td>\n",
+ " <td>из графы 5 налог на добавленную стоимость по т...</td>\n",
+ " <td>платежи за пользование природными ресурсами</td>\n",
+ " <td>из графы 7\\n налог на добычу полезных ископаемых</td>\n",
+ " <td>остальные федеральные налоги и сборы</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>3</td>\n",
+ " <td>3</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>Всего</td>\n",
+ " <td>в том числе в федеральный бюджет</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>4</td>\n",
+ " <td>4</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>А</td>\n",
+ " <td>1.0</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
" <td>7</td>\n",
- " <td>7</td>\n",
- " <td>2009</td>\n",
- " <td>4nm010618</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.06.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма задолженности, списанной на основании з...</td>\n",
- " <td>2435</td>\n",
- " <td>1799585</td>\n",
- " <td>133435115</td>\n",
- " <td>36498879</td>\n",
- " <td>19197715</td>\n",
- " <td>5102099</td>\n",
- " <td>12731</td>\n",
- " <td>2055653</td>\n",
- " <td>70568038</td>\n",
+ " <td>8</td>\n",
+ " <td>9</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>195</th>\n",
+ " <td>195</td>\n",
+ " <td>195</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>8</th>\n",
- " <td>8</td>\n",
+ " <th>196</th>\n",
+ " <td>196</td>\n",
+ " <td>196</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>197</th>\n",
+ " <td>197</td>\n",
+ " <td>197</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>198</th>\n",
+ " <td>198</td>\n",
+ " <td>198</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>199</th>\n",
+ " <td>199</td>\n",
+ " <td>199</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>200 rows × 14 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " global_index ws_index \\\n",
+ "0 0 0 \n",
+ "1 1 1 \n",
+ "2 2 2 \n",
+ "3 3 3 \n",
+ "4 4 4 \n",
+ ".. ... ... \n",
+ "195 195 195 \n",
+ "196 196 196 \n",
+ "197 197 197 \n",
+ "198 198 198 \n",
+ "199 199 199 \n",
+ "\n",
+ " filename ws_title поле \\\n",
+ "0 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "1 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "2 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "3 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "4 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 А \n",
+ ".. ... ... ... \n",
+ "195 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "196 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "197 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "198 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "199 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... 1005 None \n",
+ "\n",
+ " код строки кол. налогоплательщиков всего \\\n",
+ "0 NaN федеральным налогам и сборам None \n",
+ "1 NaN Всего из них \n",
+ "2 NaN None налог на прибыль организаций \n",
+ "3 NaN None Всего \n",
+ "4 1.0 2 3 \n",
+ ".. ... ... ... \n",
+ "195 NaN None None \n",
+ "196 NaN None None \n",
+ "197 NaN None None \n",
+ "198 NaN None None \n",
+ "199 NaN None None \n",
+ "\n",
+ " по налогу (сбору) по пени \\\n",
+ "0 None None \n",
+ "1 None None \n",
+ "2 None налог на добавленную стоимость \n",
+ "3 в том числе в федеральный бюджет None \n",
+ "4 4 5 \n",
+ ".. ... ... \n",
+ "195 None None \n",
+ "196 None None \n",
+ "197 None None \n",
+ "198 None None \n",
+ "199 None None \n",
+ "\n",
+ " по штрафам \\\n",
+ "0 None \n",
+ "1 None \n",
+ "2 из графы 5 налог на добавленную стоимость по т... \n",
+ "3 None \n",
+ "4 6 \n",
+ ".. ... \n",
+ "195 None \n",
+ "196 None \n",
+ "197 None \n",
+ "198 None \n",
+ "199 None \n",
+ "\n",
+ " проценты \\\n",
+ "0 None \n",
+ "1 None \n",
+ "2 платежи за пользование природными ресурсами \n",
+ "3 None \n",
+ "4 7 \n",
+ ".. ... \n",
+ "195 None \n",
+ "196 None \n",
+ "197 None \n",
+ "198 None \n",
+ "199 None \n",
+ "\n",
+ " по ЕСН \\\n",
+ "0 None \n",
+ "1 None \n",
+ "2 из графы 7\\n налог на добычу полезных ископаемых \n",
+ "3 None \n",
+ "4 8 \n",
+ ".. ... \n",
+ "195 None \n",
+ "196 None \n",
+ "197 None \n",
+ "198 None \n",
+ "199 None \n",
+ "\n",
+ " по страховым взносам \n",
+ "0 None \n",
+ "1 None \n",
+ "2 остальные федеральные налоги и сборы \n",
+ "3 None \n",
+ "4 9 \n",
+ ".. ... \n",
+ "195 None \n",
+ "196 None \n",
+ "197 None \n",
+ "198 None \n",
+ "199 None \n",
+ "\n",
+ "[200 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# пробуем извлечь данные из архива, и только один файл\n",
+ "proc = ExcelPreprocessor('https://www.nalog.ru/html/sites/www.new.nalog.ru/docs/otchet/4nm011018reg.zip',\n",
+ " filename_parser=lambda filename: {'filename': filename},\n",
+ " append_global_index=True, append_wb_index=True,\n",
+ " headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
+ " first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=0, in_archive_re_mask=r'.*Раздел( )?1.*\\.xlsx'\n",
+ ")\n",
+ "proc.get_sample_dataframe(0, 200)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>global_index</th>\n",
+ " <th>ws_index</th>\n",
+ " <th>filename</th>\n",
+ " <th>ws_title</th>\n",
+ " <th>поле</th>\n",
+ " <th>код строки</th>\n",
+ " <th>кол. налогоплательщиков</th>\n",
+ " <th>всего</th>\n",
+ " <th>по налогу (сбору)</th>\n",
+ " <th>по пени</th>\n",
+ " <th>по штрафам</th>\n",
+ " <th>проценты</th>\n",
+ " <th>по ЕСН</th>\n",
+ " <th>по страховым взносам</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
" <td>0</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности организаций, лик...</td>\n",
- " <td>2400</td>\n",
- " <td>40157</td>\n",
- " <td>71402906</td>\n",
- " <td>40477282</td>\n",
- " <td>12510111</td>\n",
- " <td>3381043</td>\n",
- " <td>838205</td>\n",
- " <td>1228143</td>\n",
- " <td>12968122</td>\n",
+ " <td>0</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>федеральным налогам и сборам</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>9</th>\n",
- " <td>9</td>\n",
+ " <th>1</th>\n",
" <td>1</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности индивидуальных п...</td>\n",
- " <td>2405</td>\n",
- " <td>19496</td>\n",
- " <td>4481612</td>\n",
- " <td>2347431</td>\n",
- " <td>1261980</td>\n",
- " <td>345253</td>\n",
- " <td>628</td>\n",
- " <td>72728</td>\n",
- " <td>453592</td>\n",
+ " <td>1</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>Всего</td>\n",
+ " <td>из них</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>10</th>\n",
- " <td>10</td>\n",
+ " <th>2</th>\n",
" <td>2</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности умерших или объя...</td>\n",
- " <td>2410</td>\n",
- " <td>107516</td>\n",
- " <td>1397855</td>\n",
- " <td>318751</td>\n",
- " <td>93790</td>\n",
- " <td>17866</td>\n",
- " <td>0</td>\n",
- " <td>8821</td>\n",
- " <td>958627</td>\n",
+ " <td>2</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>налог на прибыль организаций</td>\n",
+ " <td>None</td>\n",
+ " <td>налог на добавленную стоимость</td>\n",
+ " <td>из графы 5 налог на добавленную стоимость по т...</td>\n",
+ " <td>платежи за пользование природными ресурсами</td>\n",
+ " <td>из графы 7\\n налог на добычу полезных ископаемых</td>\n",
+ " <td>остальные федеральные налоги и сборы</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>11</th>\n",
- " <td>11</td>\n",
+ " <th>3</th>\n",
" <td>3</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности в случаях принят...</td>\n",
- " <td>2415</td>\n",
- " <td>152898</td>\n",
- " <td>5175662</td>\n",
- " <td>1879757</td>\n",
- " <td>1448942</td>\n",
- " <td>185201</td>\n",
- " <td>6911</td>\n",
- " <td>255959</td>\n",
- " <td>1398892</td>\n",
+ " <td>3</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>None</td>\n",
+ " <td>NaN</td>\n",
+ " <td>None</td>\n",
+ " <td>Всего</td>\n",
+ " <td>в том числе в федеральный бюджет</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>12</th>\n",
- " <td>12</td>\n",
+ " <th>4</th>\n",
" <td>4</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности по \"зависшим\" пл...</td>\n",
- " <td>2420</td>\n",
- " <td>165</td>\n",
- " <td>79701</td>\n",
- " <td>64933</td>\n",
- " <td>1515</td>\n",
- " <td>70</td>\n",
- " <td>0</td>\n",
- " <td>5565</td>\n",
- " <td>7618</td>\n",
+ " <td>4</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>А</td>\n",
+ " <td>1.0</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>7</td>\n",
+ " <td>8</td>\n",
+ " <td>9</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>13</th>\n",
- " <td>13</td>\n",
- " <td>5</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности организаций, отв...</td>\n",
- " <td>2425</td>\n",
- " <td>40759</td>\n",
- " <td>7379705</td>\n",
- " <td>4476555</td>\n",
- " <td>1843784</td>\n",
- " <td>437216</td>\n",
- " <td>х</td>\n",
- " <td>73047</td>\n",
- " <td>549103</td>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>14</th>\n",
- " <td>14</td>\n",
- " <td>6</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма списанной задолженности по решениям нало...</td>\n",
- " <td>2430</td>\n",
- " <td>16980357</td>\n",
- " <td>61711637</td>\n",
- " <td>42232603</td>\n",
- " <td>19477495</td>\n",
- " <td>676</td>\n",
- " <td>725</td>\n",
- " <td>0</td>\n",
- " <td>138</td>\n",
+ " <th>95</th>\n",
+ " <td>95</td>\n",
+ " <td>95</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>Камчатский край</td>\n",
+ " <td>3060920.0</td>\n",
+ " <td>1273911</td>\n",
+ " <td>76849</td>\n",
+ " <td>9384</td>\n",
+ " <td>989177</td>\n",
+ " <td>989065</td>\n",
+ " <td>16638</td>\n",
+ " <td>877</td>\n",
+ " <td>191247</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>15</th>\n",
- " <td>15</td>\n",
- " <td>7</td>\n",
- " <td>2008</td>\n",
- " <td>4nm011118</td>\n",
- " <td>xlsx</td>\n",
- " <td>Р. Справочно1_Списание</td>\n",
- " <td>Форма № 4-НМ</td>\n",
- " <td>Справочно к Разделам I, II: 1_Списано зад...</td>\n",
- " <td>по состоянию на 01.11.2018 г.</td>\n",
- " <td>тыс. рублей</td>\n",
- " <td>Сумма задолженности, списанной на основании з...</td>\n",
- " <td>2435</td>\n",
- " <td>3737334</td>\n",
- " <td>340440384</td>\n",
- " <td>62488024</td>\n",
- " <td>31036806</td>\n",
- " <td>7336091</td>\n",
- " <td>34915</td>\n",
- " <td>2494914</td>\n",
- " <td>237049634</td>\n",
+ " <th>96</th>\n",
+ " <td>96</td>\n",
+ " <td>96</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>Магаданская область</td>\n",
+ " <td>1118695.0</td>\n",
+ " <td>531438</td>\n",
+ " <td>121488</td>\n",
+ " <td>13819</td>\n",
+ " <td>249470</td>\n",
+ " <td>249469</td>\n",
+ " <td>31627</td>\n",
+ " <td>29256</td>\n",
+ " <td>128853</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>97</th>\n",
+ " <td>97</td>\n",
+ " <td>97</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>Сахалинская область</td>\n",
+ " <td>4404357.0</td>\n",
+ " <td>2091467</td>\n",
+ " <td>458202</td>\n",
+ " <td>72443</td>\n",
+ " <td>1110375</td>\n",
+ " <td>1110306</td>\n",
+ " <td>20647</td>\n",
+ " <td>4214</td>\n",
+ " <td>502243</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>98</th>\n",
+ " <td>98</td>\n",
+ " <td>98</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>Еврейская автономная область</td>\n",
+ " <td>540981.0</td>\n",
+ " <td>120118</td>\n",
+ " <td>14430</td>\n",
+ " <td>1406</td>\n",
+ " <td>58193</td>\n",
+ " <td>58037</td>\n",
+ " <td>2554</td>\n",
+ " <td>59</td>\n",
+ " <td>44941</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>99</th>\n",
+ " <td>99</td>\n",
+ " <td>99</td>\n",
+ " <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+ " <td>1005</td>\n",
+ " <td>Чукотский АО</td>\n",
+ " <td>61052.0</td>\n",
+ " <td>44624</td>\n",
+ " <td>10852</td>\n",
+ " <td>1076</td>\n",
+ " <td>25804</td>\n",
+ " <td>25804</td>\n",
+ " <td>1336</td>\n",
+ " <td>857</td>\n",
+ " <td>6632</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
+ "<p>100 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
- " global_index ws_index year filename extension ws_title \\\n",
- "0 0 0 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "1 1 1 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "2 2 2 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "3 3 3 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "4 4 4 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "5 5 5 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "6 6 6 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "7 7 7 2009 4nm010618 xlsx Р. Справочно1_Списание \n",
- "8 8 0 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
- "9 9 1 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
- "10 10 2 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
- "11 11 3 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
- "12 12 4 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
- "13 13 5 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
- "14 14 6 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
- "15 15 7 2008 4nm011118 xlsx Р. Справочно1_Списание \n",
+ " global_index ws_index filename \\\n",
+ "0 0 0 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "1 1 1 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "2 2 2 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "3 3 3 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "4 4 4 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ ".. ... ... ... \n",
+ "95 95 95 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "96 96 96 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "97 97 97 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "98 98 98 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
+ "99 99 99 https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r... \n",
"\n",
- " form_name description \\\n",
- "0 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "1 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "2 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "3 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "4 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "5 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "6 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "7 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "8 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "9 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "10 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "11 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "12 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "13 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "14 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
- "15 Форма № 4-НМ Справочно к Разделам I, II: 1_Списано зад... \n",
+ " ws_title поле код строки \\\n",
+ "0 1005 None NaN \n",
+ "1 1005 None NaN \n",
+ "2 1005 None NaN \n",
+ "3 1005 None NaN \n",
+ "4 1005 А 1.0 \n",
+ ".. ... ... ... \n",
+ "95 1005 Камчатский край 3060920.0 \n",
+ "96 1005 Магаданская область 1118695.0 \n",
+ "97 1005 Сахалинская область 4404357.0 \n",
+ "98 1005 Еврейская автономная область 540981.0 \n",
+ "99 1005 Чукотский АО 61052.0 \n",
"\n",
- " form_dt unit_name \\\n",
- "0 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "1 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "2 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "3 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "4 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "5 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "6 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "7 по состоянию на 01.06.2018 г. тыс. рублей \n",
- "8 по состоянию на 01.11.2018 г. тыс. рублей \n",
- "9 по состоянию на 01.11.2018 г. тыс. рублей \n",
- "10 по состоянию на 01.11.2018 г. тыс. рублей \n",
- "11 по состоянию на 01.11.2018 г. тыс. рублей \n",
- "12 по состоянию на 01.11.2018 г. тыс. рублей \n",
- "13 по состоянию на 01.11.2018 г. тыс. рублей \n",
- "14 по состоянию на 01.11.2018 г. тыс. рублей \n",
- "15 по состоянию на 01.11.2018 г. тыс. рублей \n",
+ " кол. налогоплательщиков всего \\\n",
+ "0 федеральным налогам и сборам None \n",
+ "1 Всего из них \n",
+ "2 None налог на прибыль организаций \n",
+ "3 None Всего \n",
+ "4 2 3 \n",
+ ".. ... ... \n",
+ "95 1273911 76849 \n",
+ "96 531438 121488 \n",
+ "97 2091467 458202 \n",
+ "98 120118 14430 \n",
+ "99 44624 10852 \n",
"\n",
- " поле код строки \\\n",
- "0 Сумма списанной задолженности организаций, лик... 2400 \n",
- "1 Сумма списанной задолженности индивидуальных п... 2405 \n",
- "2 Сумма списанной задолженности умерших или объя... 2410 \n",
- "3 Сумма списанной задолженности в случаях принят... 2415 \n",
- "4 Сумма списанной задолженности по \"зависшим\" пл... 2420 \n",
- "5 Сумма списанной задолженности организаций, отв... 2425 \n",
- "6 Сумма списанной задолженности по решениям нало... 2430 \n",
- "7 Сумма задолженности, списанной на основании з... 2435 \n",
- "8 Сумма списанной задолженности организаций, лик... 2400 \n",
- "9 Сумма списанной задолженности индивидуальных п... 2405 \n",
- "10 Сумма списанной задолженности умерших или объя... 2410 \n",
- "11 Сумма списанной задолженности в случаях принят... 2415 \n",
- "12 Сумма списанной задолженности по \"зависшим\" пл... 2420 \n",
- "13 Сумма списанной задолженности организаций, отв... 2425 \n",
- "14 Сумма списанной задолженности по решениям нало... 2430 \n",
- "15 Сумма задолженности, списанной на основании з... 2435 \n",
+ " по налогу (сбору) по пени \\\n",
+ "0 None None \n",
+ "1 None None \n",
+ "2 None налог на добавленную стоимость \n",
+ "3 в том числе в федеральный бюджет None \n",
+ "4 4 5 \n",
+ ".. ... ... \n",
+ "95 9384 989177 \n",
+ "96 13819 249470 \n",
+ "97 72443 1110375 \n",
+ "98 1406 58193 \n",
+ "99 1076 25804 \n",
"\n",
- " кол. налогоплательщиков всего по налогу (сбору) по пени \\\n",
- "0 23632 30705385 18388429 4699527 \n",
- "1 9109 3013574 1661920 881213 \n",
- "2 39920 677495 115346 44047 \n",
- "3 58635 2161183 773472 556271 \n",
- "4 93 43115 31946 752 \n",
- "5 25104 4038755 2487632 1003525 \n",
- "6 16841363 60557919 41458668 19050086 \n",
- "7 1799585 133435115 36498879 19197715 \n",
- "8 40157 71402906 40477282 12510111 \n",
- "9 19496 4481612 2347431 1261980 \n",
- "10 107516 1397855 318751 93790 \n",
- "11 152898 5175662 1879757 1448942 \n",
- "12 165 79701 64933 1515 \n",
- "13 40759 7379705 4476555 1843784 \n",
- "14 16980357 61711637 42232603 19477495 \n",
- "15 3737334 340440384 62488024 31036806 \n",
+ " по штрафам \\\n",
+ "0 None \n",
+ "1 None \n",
+ "2 из графы 5 налог на добавленную стоимость по т... \n",
+ "3 None \n",
+ "4 6 \n",
+ ".. ... \n",
+ "95 989065 \n",
+ "96 249469 \n",
+ "97 1110306 \n",
+ "98 58037 \n",
+ "99 25804 \n",
"\n",
- " по штрафам проценты по ЕСН по страховым взносам \n",
- "0 1439452 184237 596724 5397016 \n",
- "1 242978 628 51928 174907 \n",
- "2 11984 0 5203 500915 \n",
- "3 70216 5769 154600 600855 \n",
- "4 76 0 5527 4814 \n",
- "5 215253 х 29516 302829 \n",
- "6 325 1 0 48839 \n",
- "7 5102099 12731 2055653 70568038 \n",
- "8 3381043 838205 1228143 12968122 \n",
- "9 345253 628 72728 453592 \n",
- "10 17866 0 8821 958627 \n",
- "11 185201 6911 255959 1398892 \n",
- "12 70 0 5565 7618 \n",
- "13 437216 х 73047 549103 \n",
- "14 676 725 0 138 \n",
- "15 7336091 34915 2494914 237049634 "
+ " проценты \\\n",
+ "0 None \n",
+ "1 None \n",
+ "2 платежи за пользование природными ресурсами \n",
+ "3 None \n",
+ "4 7 \n",
+ ".. ... \n",
+ "95 16638 \n",
+ "96 31627 \n",
+ "97 20647 \n",
+ "98 2554 \n",
+ "99 1336 \n",
+ "\n",
+ " по ЕСН \\\n",
+ "0 None \n",
+ "1 None \n",
+ "2 из графы 7\\n налог на добычу полезных ископаемых \n",
+ "3 None \n",
+ "4 8 \n",
+ ".. ... \n",
+ "95 877 \n",
+ "96 29256 \n",
+ "97 4214 \n",
+ "98 59 \n",
+ "99 857 \n",
+ "\n",
+ " по страховым взносам \n",
+ "0 None \n",
+ "1 None \n",
+ "2 остальные федеральные налоги и сборы \n",
+ "3 None \n",
+ "4 9 \n",
+ ".. ... \n",
+ "95 191247 \n",
+ "96 128853 \n",
+ "97 502243 \n",
+ "98 44941 \n",
+ "99 6632 \n",
+ "\n",
+ "[100 rows x 14 columns]"
]
},
"execution_count": 14,
@@ -1203,15 +1598,9 @@ "output_type": "execute_result"
}
],
"source": [
- "# пример как полностью настроить класс сразу при создании\n",
- "proc = ExcelPreprocessor('data\\\\2009\\\\4nm010618.xlsx', 'data\\\\2008\\\\4nm011118.xlsx',\n",
- " filename_parser=lambda filename: re.match(r'data\\\\(?P<year>\\w+)\\\\(?P<filename>\\w+)\\.(?P<extension>\\w+)', filename).groupdict(),\n",
- " append_global_index=True, append_wb_index=True,\n",
- " cells = {'form_name': 'J1', 'description': 'A2', 'form_dt': 'A3', 'unit_name': 'J4'},\n",
- " headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
- " first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=2\n",
- ")\n",
- "proc.get_dataframe()"
+ "# видим много пустых строк (без значений). Отфильтруем только те где есть хотя бы одно значение\n",
+ "proc.filter_empty_cells = True\n",
+ "proc.get_sample_dataframe(0, 200)"
]
}
],