git / code.ach.gov.ru / gavrin / jupyter_snippet

commit 70fe216e22369347f436f9a443821b0783621b04

author Дмитрий Сергеевич Гаврин <gavrin_ds@ach.gov.ru>

date 2020-04-02 14:54:40 +0300

parents 70536840

browse tree at this commit

message

добавлена фильтрация пустых ячеек

files

file | add | del
ExcelPreprocessor.ipynb | +739 | -350

patch

diff --git a/ExcelPreprocessor.ipynb b/ExcelPreprocessor.ipynb
index fa3fe7e564758f134ab02415dc2617b951626194..02ca3a3f9bf1aaa43004ab62c24fc408d7b053ce 100644
--- a/ExcelPreprocessor.ipynb
+++ b/ExcelPreprocessor.ipynb
@@ -47,9 +47,9 @@       "Requirement already satisfied: openpyxl in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.0.3)\n",
       "Requirement already satisfied: xlrd in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (1.2.0)\n",
       "Requirement already satisfied: chardet in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.0.4)\n",
       "Requirement already satisfied: rarfile in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (3.1)\n",
+      "Requirement already satisfied: numpy>=1.13.3 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (1.18.1)\n",
       "Requirement already satisfied: pytz>=2017.2 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (2019.3)\n",
       "Requirement already satisfied: python-dateutil>=2.6.1 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (2.8.1)\n",
-      "Requirement already satisfied: numpy>=1.13.3 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from pandas) (1.18.1)\n",
       "Requirement already satisfied: jdcal in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from openpyxl) (1.4.1)\n",
       "Requirement already satisfied: et_xmlfile in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from openpyxl) (1.0.1)\n",
       "Requirement already satisfied: six>=1.5 in c:\\dev\\.jupyter\\.venv\\lib\\site-packages (from python-dateutil>=2.6.1->pandas) (1.14.0)\n"
@@ -96,7 +96,7 @@    "source": [
     "class ExcelPreprocessor:\n",
     "    def __init__(self, *wb, append_global_index=False, append_wb_index=False, \n",
     "                 filename_parser=None, append_ws_title_column=False, first_row_number=1,\n",
-    "                 iterate_over_worksheet=0, cells=None, headers=None, in_archive_re_mask=None):\n",
+    "                 iterate_over_worksheet=0, cells=None, headers=None, in_archive_re_mask=None, filter_empty_cells=False):\n",
     "        '''\n",
     "        параметр iterate_over_worksheet отвечает за то, какой именно рабочий лист в каждой прочитанной\n",
     "        книге будет разобран по умолчанию это лист с индексом 0, т.е. 1 в списке.\n",
@@ -122,6 +122,7 @@     "        self.iterate_over_worksheet = iterate_over_worksheet\n",
     "        self.cells = cells\n",
     "        self.headers = headers\n",
     "        self.in_archive_re_mask = in_archive_re_mask\n",
+    "        self.filter_empty_cells = filter_empty_cells\n",
     "\n",
     "    def __repr__(self):\n",
     "        return f'''{ExcelPreprocessor.__name__}{self.wb}:\n",
@@ -156,7 +157,6 @@     "                from zipfile import ZipFile\n",
     "                import chardet\n",
     "                ZIP_FILENAME_UTF8_FLAG = 0x800\n",
     "                zf = ZipFile(rf , 'r')\n",
-    "                print(filename)\n",
     "                for info in zf.filelist:\n",
     "                    zfilename = info.filename\n",
     "                    if not zfilename.endswith('.xlsx'):\n",
@@ -170,14 +170,6 @@     "                            if not re.match(self.in_archive_re_mask, zfilename):\n",
     "                                continue\n",
     "                        yield str(Path(filename) / Path(zfilename)), zf.open(info.filename, 'r')\n",
     "\n",
-    "            if filename.endswith('.rar'):\n",
-    "                from rarfile import RarFile\n",
-    "                rf = rarfile.RarFile(rar_fn, 'r')\n",
-    "                for f in rf.infolist():\n",
-    "                    print(f.filename, f.file_size)\n",
-    "                    if f.filename == 'README':\n",
-    "                        print(rf.read(f))\n",
-    "\n",
     "    @property\n",
     "    def _columns(self):\n",
     "        columns = []\n",
@@ -212,7 +204,10 @@     "                iter_sheet = [i for i, ws in enumerate(wb.worksheets) if not ws.title.startswith('hidden')]\n",
     "\n",
     "            for ws_index in iter_sheet:\n",
     "                ws = wb.worksheets[ws_index]\n",
-    "                result = ((cell.value for cell in row) for row in ws.rows)\n",
+    "                if self.filter_empty_cells:\n",
+    "                    result = ((cell.value for cell in row) for row in ws.rows if any(cell.value for cell in row))\n",
+    "                else:\n",
+    "                    result = ((cell.value for cell in row) for row in ws.rows)\n",
     "\n",
     "                if self.first_row_number:\n",
     "                    result = islice(result, self.first_row_number - 1, None)\n",
@@ -471,7 +466,7 @@    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -656,7 +651,7 @@        "3   221615329  220703752  5920609               5756088  \n",
        "4    47161217   47056987    36418                 27229  "
       ]
      },
-     "execution_count": 13,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -668,7 +663,7 @@    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -830,371 +825,771 @@        "      <td>0</td>\n",
        "      <td>5527</td>\n",
        "      <td>4814</td>\n",
        "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   global_index  ws_index  year   filename extension                ws_title  \\\n",
+       "0             0         0  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
+       "1             1         1  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
+       "2             2         2  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
+       "3             3         3  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
+       "4             4         4  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
+       "\n",
+       "      form_name                                        description  \\\n",
+       "0  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
+       "1  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
+       "2  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
+       "3  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
+       "4  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
+       "\n",
+       "                         form_dt    unit_name  \\\n",
+       "0  по состоянию на 01.06.2018 г.  тыс. рублей   \n",
+       "1  по состоянию на 01.06.2018 г.  тыс. рублей   \n",
+       "2  по состоянию на 01.06.2018 г.  тыс. рублей   \n",
+       "3  по состоянию на 01.06.2018 г.  тыс. рублей   \n",
+       "4  по состоянию на 01.06.2018 г.  тыс. рублей   \n",
+       "\n",
+       "                                                поле  код строки  \\\n",
+       "0  Сумма списанной задолженности организаций, лик...        2400   \n",
+       "1  Сумма списанной задолженности индивидуальных п...        2405   \n",
+       "2  Сумма списанной задолженности умерших или объя...        2410   \n",
+       "3  Сумма списанной задолженности в случаях принят...        2415   \n",
+       "4  Сумма списанной задолженности по \"зависшим\" пл...        2420   \n",
+       "\n",
+       "   кол. налогоплательщиков     всего  по налогу (сбору)  по пени  по штрафам  \\\n",
+       "0                    23632  30705385           18388429  4699527     1439452   \n",
+       "1                     9109   3013574            1661920   881213      242978   \n",
+       "2                    39920    677495             115346    44047       11984   \n",
+       "3                    58635   2161183             773472   556271       70216   \n",
+       "4                       93     43115              31946      752          76   \n",
+       "\n",
+       "  проценты  по ЕСН  по страховым взносам  \n",
+       "0   184237  596724               5397016  \n",
+       "1      628   51928                174907  \n",
+       "2        0    5203                500915  \n",
+       "3     5769  154600                600855  \n",
+       "4        0    5527                  4814  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# пример как полностью настроить класс сразу при создании\n",
+    "proc = ExcelPreprocessor('data\\\\2009\\\\4nm010618.xlsx', 'data\\\\2008\\\\4nm011118.xlsx',\n",
+    "    filename_parser=lambda filename: re.match(r'data\\\\(?P<year>\\w+)\\\\(?P<filename>\\w+)\\.(?P<extension>\\w+)', filename).groupdict(),\n",
+    "    append_global_index=True, append_wb_index=True,\n",
+    "    cells = {'form_name': 'J1', 'description': 'A2', 'form_dt': 'A3', 'unit_name': 'J4'},\n",
+    "    headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
+    "    first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=2\n",
+    ")\n",
+    "proc.get_dataframe()[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>global_index</th>\n",
+       "      <th>ws_index</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>ws_title</th>\n",
+       "      <th>поле</th>\n",
+       "      <th>код строки</th>\n",
+       "      <th>кол. налогоплательщиков</th>\n",
+       "      <th>всего</th>\n",
+       "      <th>по налогу (сбору)</th>\n",
+       "      <th>по пени</th>\n",
+       "      <th>по штрафам</th>\n",
+       "      <th>проценты</th>\n",
+       "      <th>по ЕСН</th>\n",
+       "      <th>по страховым взносам</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
        "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>5</td>\n",
-       "      <td>5</td>\n",
-       "      <td>2009</td>\n",
-       "      <td>4nm010618</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.06.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности организаций, отв...</td>\n",
-       "      <td>2425</td>\n",
-       "      <td>25104</td>\n",
-       "      <td>4038755</td>\n",
-       "      <td>2487632</td>\n",
-       "      <td>1003525</td>\n",
-       "      <td>215253</td>\n",
-       "      <td>х</td>\n",
-       "      <td>29516</td>\n",
-       "      <td>302829</td>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>федеральным налогам и сборам</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2009</td>\n",
-       "      <td>4nm010618</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.06.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности по решениям нало...</td>\n",
-       "      <td>2430</td>\n",
-       "      <td>16841363</td>\n",
-       "      <td>60557919</td>\n",
-       "      <td>41458668</td>\n",
-       "      <td>19050086</td>\n",
-       "      <td>325</td>\n",
+       "      <th>1</th>\n",
        "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>48839</td>\n",
+       "      <td>1</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Всего</td>\n",
+       "      <td>из них</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7</th>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>налог на прибыль организаций</td>\n",
+       "      <td>None</td>\n",
+       "      <td>налог на добавленную стоимость</td>\n",
+       "      <td>из графы 5 налог на добавленную стоимость по т...</td>\n",
+       "      <td>платежи за пользование природными ресурсами</td>\n",
+       "      <td>из графы 7\\n налог на добычу полезных ископаемых</td>\n",
+       "      <td>остальные федеральные налоги и сборы</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Всего</td>\n",
+       "      <td>в том числе в федеральный бюджет</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>А</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>4</td>\n",
+       "      <td>5</td>\n",
+       "      <td>6</td>\n",
        "      <td>7</td>\n",
-       "      <td>7</td>\n",
-       "      <td>2009</td>\n",
-       "      <td>4nm010618</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.06.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма задолженности, списанной на  основании з...</td>\n",
-       "      <td>2435</td>\n",
-       "      <td>1799585</td>\n",
-       "      <td>133435115</td>\n",
-       "      <td>36498879</td>\n",
-       "      <td>19197715</td>\n",
-       "      <td>5102099</td>\n",
-       "      <td>12731</td>\n",
-       "      <td>2055653</td>\n",
-       "      <td>70568038</td>\n",
+       "      <td>8</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>195</th>\n",
+       "      <td>195</td>\n",
+       "      <td>195</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>8</td>\n",
+       "      <th>196</th>\n",
+       "      <td>196</td>\n",
+       "      <td>196</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>197</th>\n",
+       "      <td>197</td>\n",
+       "      <td>197</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>198</th>\n",
+       "      <td>198</td>\n",
+       "      <td>198</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>199</th>\n",
+       "      <td>199</td>\n",
+       "      <td>199</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>200 rows × 14 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     global_index  ws_index  \\\n",
+       "0               0         0   \n",
+       "1               1         1   \n",
+       "2               2         2   \n",
+       "3               3         3   \n",
+       "4               4         4   \n",
+       "..            ...       ...   \n",
+       "195           195       195   \n",
+       "196           196       196   \n",
+       "197           197       197   \n",
+       "198           198       198   \n",
+       "199           199       199   \n",
+       "\n",
+       "                                              filename ws_title  поле  \\\n",
+       "0    https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "1    https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "2    https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "3    https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "4    https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005     А   \n",
+       "..                                                 ...      ...   ...   \n",
+       "195  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "196  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "197  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "198  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "199  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...     1005  None   \n",
+       "\n",
+       "     код строки       кол. налогоплательщиков                         всего  \\\n",
+       "0           NaN  федеральным налогам и сборам                          None   \n",
+       "1           NaN                        Всего                         из них   \n",
+       "2           NaN                          None  налог на прибыль организаций   \n",
+       "3           NaN                          None                        Всего    \n",
+       "4           1.0                             2                             3   \n",
+       "..          ...                           ...                           ...   \n",
+       "195         NaN                          None                          None   \n",
+       "196         NaN                          None                          None   \n",
+       "197         NaN                          None                          None   \n",
+       "198         NaN                          None                          None   \n",
+       "199         NaN                          None                          None   \n",
+       "\n",
+       "                    по налогу (сбору)                          по пени  \\\n",
+       "0                                None                             None   \n",
+       "1                                None                             None   \n",
+       "2                                None  налог на добавленную стоимость    \n",
+       "3    в том числе в федеральный бюджет                             None   \n",
+       "4                                   4                                5   \n",
+       "..                                ...                              ...   \n",
+       "195                              None                             None   \n",
+       "196                              None                             None   \n",
+       "197                              None                             None   \n",
+       "198                              None                             None   \n",
+       "199                              None                             None   \n",
+       "\n",
+       "                                            по штрафам  \\\n",
+       "0                                                 None   \n",
+       "1                                                 None   \n",
+       "2    из графы 5 налог на добавленную стоимость по т...   \n",
+       "3                                                 None   \n",
+       "4                                                    6   \n",
+       "..                                                 ...   \n",
+       "195                                               None   \n",
+       "196                                               None   \n",
+       "197                                               None   \n",
+       "198                                               None   \n",
+       "199                                               None   \n",
+       "\n",
+       "                                         проценты  \\\n",
+       "0                                            None   \n",
+       "1                                            None   \n",
+       "2    платежи за пользование природными ресурсами    \n",
+       "3                                            None   \n",
+       "4                                               7   \n",
+       "..                                            ...   \n",
+       "195                                          None   \n",
+       "196                                          None   \n",
+       "197                                          None   \n",
+       "198                                          None   \n",
+       "199                                          None   \n",
+       "\n",
+       "                                               по ЕСН  \\\n",
+       "0                                                None   \n",
+       "1                                                None   \n",
+       "2    из графы 7\\n налог на добычу полезных ископаемых   \n",
+       "3                                                None   \n",
+       "4                                                   8   \n",
+       "..                                                ...   \n",
+       "195                                              None   \n",
+       "196                                              None   \n",
+       "197                                              None   \n",
+       "198                                              None   \n",
+       "199                                              None   \n",
+       "\n",
+       "                     по страховым взносам  \n",
+       "0                                    None  \n",
+       "1                                    None  \n",
+       "2    остальные федеральные налоги и сборы  \n",
+       "3                                    None  \n",
+       "4                                       9  \n",
+       "..                                    ...  \n",
+       "195                                  None  \n",
+       "196                                  None  \n",
+       "197                                  None  \n",
+       "198                                  None  \n",
+       "199                                  None  \n",
+       "\n",
+       "[200 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# пробуем извлечь данные из архива, и только один файл\n",
+    "proc = ExcelPreprocessor('https://www.nalog.ru/html/sites/www.new.nalog.ru/docs/otchet/4nm011018reg.zip',\n",
+    "    filename_parser=lambda filename: {'filename': filename},\n",
+    "    append_global_index=True, append_wb_index=True,\n",
+    "    headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
+    "    first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=0, in_archive_re_mask=r'.*Раздел( )?1.*\\.xlsx'\n",
+    ")\n",
+    "proc.get_sample_dataframe(0, 200)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>global_index</th>\n",
+       "      <th>ws_index</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>ws_title</th>\n",
+       "      <th>поле</th>\n",
+       "      <th>код строки</th>\n",
+       "      <th>кол. налогоплательщиков</th>\n",
+       "      <th>всего</th>\n",
+       "      <th>по налогу (сбору)</th>\n",
+       "      <th>по пени</th>\n",
+       "      <th>по штрафам</th>\n",
+       "      <th>проценты</th>\n",
+       "      <th>по ЕСН</th>\n",
+       "      <th>по страховым взносам</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
        "      <td>0</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности организаций, лик...</td>\n",
-       "      <td>2400</td>\n",
-       "      <td>40157</td>\n",
-       "      <td>71402906</td>\n",
-       "      <td>40477282</td>\n",
-       "      <td>12510111</td>\n",
-       "      <td>3381043</td>\n",
-       "      <td>838205</td>\n",
-       "      <td>1228143</td>\n",
-       "      <td>12968122</td>\n",
+       "      <td>0</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>федеральным налогам и сборам</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>9</td>\n",
+       "      <th>1</th>\n",
        "      <td>1</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности индивидуальных п...</td>\n",
-       "      <td>2405</td>\n",
-       "      <td>19496</td>\n",
-       "      <td>4481612</td>\n",
-       "      <td>2347431</td>\n",
-       "      <td>1261980</td>\n",
-       "      <td>345253</td>\n",
-       "      <td>628</td>\n",
-       "      <td>72728</td>\n",
-       "      <td>453592</td>\n",
+       "      <td>1</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Всего</td>\n",
+       "      <td>из них</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>10</td>\n",
+       "      <th>2</th>\n",
        "      <td>2</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности умерших или объя...</td>\n",
-       "      <td>2410</td>\n",
-       "      <td>107516</td>\n",
-       "      <td>1397855</td>\n",
-       "      <td>318751</td>\n",
-       "      <td>93790</td>\n",
-       "      <td>17866</td>\n",
-       "      <td>0</td>\n",
-       "      <td>8821</td>\n",
-       "      <td>958627</td>\n",
+       "      <td>2</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>налог на прибыль организаций</td>\n",
+       "      <td>None</td>\n",
+       "      <td>налог на добавленную стоимость</td>\n",
+       "      <td>из графы 5 налог на добавленную стоимость по т...</td>\n",
+       "      <td>платежи за пользование природными ресурсами</td>\n",
+       "      <td>из графы 7\\n налог на добычу полезных ископаемых</td>\n",
+       "      <td>остальные федеральные налоги и сборы</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>11</td>\n",
+       "      <th>3</th>\n",
        "      <td>3</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности в случаях принят...</td>\n",
-       "      <td>2415</td>\n",
-       "      <td>152898</td>\n",
-       "      <td>5175662</td>\n",
-       "      <td>1879757</td>\n",
-       "      <td>1448942</td>\n",
-       "      <td>185201</td>\n",
-       "      <td>6911</td>\n",
-       "      <td>255959</td>\n",
-       "      <td>1398892</td>\n",
+       "      <td>3</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Всего</td>\n",
+       "      <td>в том числе в федеральный бюджет</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>12</td>\n",
+       "      <th>4</th>\n",
        "      <td>4</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности по \"зависшим\" пл...</td>\n",
-       "      <td>2420</td>\n",
-       "      <td>165</td>\n",
-       "      <td>79701</td>\n",
-       "      <td>64933</td>\n",
-       "      <td>1515</td>\n",
-       "      <td>70</td>\n",
-       "      <td>0</td>\n",
-       "      <td>5565</td>\n",
-       "      <td>7618</td>\n",
+       "      <td>4</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>А</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>4</td>\n",
+       "      <td>5</td>\n",
+       "      <td>6</td>\n",
+       "      <td>7</td>\n",
+       "      <td>8</td>\n",
+       "      <td>9</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>13</td>\n",
-       "      <td>5</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности организаций, отв...</td>\n",
-       "      <td>2425</td>\n",
-       "      <td>40759</td>\n",
-       "      <td>7379705</td>\n",
-       "      <td>4476555</td>\n",
-       "      <td>1843784</td>\n",
-       "      <td>437216</td>\n",
-       "      <td>х</td>\n",
-       "      <td>73047</td>\n",
-       "      <td>549103</td>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>14</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма списанной задолженности по решениям нало...</td>\n",
-       "      <td>2430</td>\n",
-       "      <td>16980357</td>\n",
-       "      <td>61711637</td>\n",
-       "      <td>42232603</td>\n",
-       "      <td>19477495</td>\n",
-       "      <td>676</td>\n",
-       "      <td>725</td>\n",
-       "      <td>0</td>\n",
-       "      <td>138</td>\n",
+       "      <th>95</th>\n",
+       "      <td>95</td>\n",
+       "      <td>95</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>Камчатский край</td>\n",
+       "      <td>3060920.0</td>\n",
+       "      <td>1273911</td>\n",
+       "      <td>76849</td>\n",
+       "      <td>9384</td>\n",
+       "      <td>989177</td>\n",
+       "      <td>989065</td>\n",
+       "      <td>16638</td>\n",
+       "      <td>877</td>\n",
+       "      <td>191247</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>15</td>\n",
-       "      <td>7</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>4nm011118</td>\n",
-       "      <td>xlsx</td>\n",
-       "      <td>Р. Справочно1_Списание</td>\n",
-       "      <td>Форма № 4-НМ</td>\n",
-       "      <td>Справочно к Разделам I, II:      1_Списано зад...</td>\n",
-       "      <td>по состоянию на 01.11.2018 г.</td>\n",
-       "      <td>тыс. рублей</td>\n",
-       "      <td>Сумма задолженности, списанной на  основании з...</td>\n",
-       "      <td>2435</td>\n",
-       "      <td>3737334</td>\n",
-       "      <td>340440384</td>\n",
-       "      <td>62488024</td>\n",
-       "      <td>31036806</td>\n",
-       "      <td>7336091</td>\n",
-       "      <td>34915</td>\n",
-       "      <td>2494914</td>\n",
-       "      <td>237049634</td>\n",
+       "      <th>96</th>\n",
+       "      <td>96</td>\n",
+       "      <td>96</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>Магаданская область</td>\n",
+       "      <td>1118695.0</td>\n",
+       "      <td>531438</td>\n",
+       "      <td>121488</td>\n",
+       "      <td>13819</td>\n",
+       "      <td>249470</td>\n",
+       "      <td>249469</td>\n",
+       "      <td>31627</td>\n",
+       "      <td>29256</td>\n",
+       "      <td>128853</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>97</td>\n",
+       "      <td>97</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>Сахалинская область</td>\n",
+       "      <td>4404357.0</td>\n",
+       "      <td>2091467</td>\n",
+       "      <td>458202</td>\n",
+       "      <td>72443</td>\n",
+       "      <td>1110375</td>\n",
+       "      <td>1110306</td>\n",
+       "      <td>20647</td>\n",
+       "      <td>4214</td>\n",
+       "      <td>502243</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>98</td>\n",
+       "      <td>98</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>Еврейская автономная область</td>\n",
+       "      <td>540981.0</td>\n",
+       "      <td>120118</td>\n",
+       "      <td>14430</td>\n",
+       "      <td>1406</td>\n",
+       "      <td>58193</td>\n",
+       "      <td>58037</td>\n",
+       "      <td>2554</td>\n",
+       "      <td>59</td>\n",
+       "      <td>44941</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>99</td>\n",
+       "      <td>99</td>\n",
+       "      <td>https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...</td>\n",
+       "      <td>1005</td>\n",
+       "      <td>Чукотский АО</td>\n",
+       "      <td>61052.0</td>\n",
+       "      <td>44624</td>\n",
+       "      <td>10852</td>\n",
+       "      <td>1076</td>\n",
+       "      <td>25804</td>\n",
+       "      <td>25804</td>\n",
+       "      <td>1336</td>\n",
+       "      <td>857</td>\n",
+       "      <td>6632</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>100 rows × 14 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "    global_index  ws_index  year   filename extension                ws_title  \\\n",
-       "0              0         0  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "1              1         1  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "2              2         2  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "3              3         3  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "4              4         4  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "5              5         5  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "6              6         6  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "7              7         7  2009  4nm010618      xlsx  Р. Справочно1_Списание   \n",
-       "8              8         0  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
-       "9              9         1  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
-       "10            10         2  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
-       "11            11         3  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
-       "12            12         4  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
-       "13            13         5  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
-       "14            14         6  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
-       "15            15         7  2008  4nm011118      xlsx  Р. Справочно1_Списание   \n",
+       "    global_index  ws_index                                           filename  \\\n",
+       "0              0         0  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "1              1         1  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "2              2         2  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "3              3         3  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "4              4         4  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "..           ...       ...                                                ...   \n",
+       "95            95        95  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "96            96        96  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "97            97        97  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "98            98        98  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
+       "99            99        99  https:\\www.nalog.ru\\html\\sites\\www.new.nalog.r...   \n",
        "\n",
-       "       form_name                                        description  \\\n",
-       "0   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "1   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "2   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "3   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "4   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "5   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "6   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "7   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "8   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "9   Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "10  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "11  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "12  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "13  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "14  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
-       "15  Форма № 4-НМ  Справочно к Разделам I, II:      1_Списано зад...   \n",
+       "   ws_title                          поле  код строки  \\\n",
+       "0      1005                          None         NaN   \n",
+       "1      1005                          None         NaN   \n",
+       "2      1005                          None         NaN   \n",
+       "3      1005                          None         NaN   \n",
+       "4      1005                             А         1.0   \n",
+       "..      ...                           ...         ...   \n",
+       "95     1005               Камчатский край   3060920.0   \n",
+       "96     1005           Магаданская область   1118695.0   \n",
+       "97     1005           Сахалинская область   4404357.0   \n",
+       "98     1005  Еврейская автономная область    540981.0   \n",
+       "99     1005                  Чукотский АО     61052.0   \n",
        "\n",
-       "                          form_dt    unit_name  \\\n",
-       "0   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "1   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "2   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "3   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "4   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "5   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "6   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "7   по состоянию на 01.06.2018 г.  тыс. рублей   \n",
-       "8   по состоянию на 01.11.2018 г.  тыс. рублей   \n",
-       "9   по состоянию на 01.11.2018 г.  тыс. рублей   \n",
-       "10  по состоянию на 01.11.2018 г.  тыс. рублей   \n",
-       "11  по состоянию на 01.11.2018 г.  тыс. рублей   \n",
-       "12  по состоянию на 01.11.2018 г.  тыс. рублей   \n",
-       "13  по состоянию на 01.11.2018 г.  тыс. рублей   \n",
-       "14  по состоянию на 01.11.2018 г.  тыс. рублей   \n",
-       "15  по состоянию на 01.11.2018 г.  тыс. рублей   \n",
+       "         кол. налогоплательщиков                         всего  \\\n",
+       "0   федеральным налогам и сборам                          None   \n",
+       "1                         Всего                         из них   \n",
+       "2                           None  налог на прибыль организаций   \n",
+       "3                           None                        Всего    \n",
+       "4                              2                             3   \n",
+       "..                           ...                           ...   \n",
+       "95                       1273911                         76849   \n",
+       "96                        531438                        121488   \n",
+       "97                       2091467                        458202   \n",
+       "98                        120118                         14430   \n",
+       "99                         44624                         10852   \n",
        "\n",
-       "                                                 поле  код строки  \\\n",
-       "0   Сумма списанной задолженности организаций, лик...        2400   \n",
-       "1   Сумма списанной задолженности индивидуальных п...        2405   \n",
-       "2   Сумма списанной задолженности умерших или объя...        2410   \n",
-       "3   Сумма списанной задолженности в случаях принят...        2415   \n",
-       "4   Сумма списанной задолженности по \"зависшим\" пл...        2420   \n",
-       "5   Сумма списанной задолженности организаций, отв...        2425   \n",
-       "6   Сумма списанной задолженности по решениям нало...        2430   \n",
-       "7   Сумма задолженности, списанной на  основании з...        2435   \n",
-       "8   Сумма списанной задолженности организаций, лик...        2400   \n",
-       "9   Сумма списанной задолженности индивидуальных п...        2405   \n",
-       "10  Сумма списанной задолженности умерших или объя...        2410   \n",
-       "11  Сумма списанной задолженности в случаях принят...        2415   \n",
-       "12  Сумма списанной задолженности по \"зависшим\" пл...        2420   \n",
-       "13  Сумма списанной задолженности организаций, отв...        2425   \n",
-       "14  Сумма списанной задолженности по решениям нало...        2430   \n",
-       "15  Сумма задолженности, списанной на  основании з...        2435   \n",
+       "                   по налогу (сбору)                          по пени  \\\n",
+       "0                               None                             None   \n",
+       "1                               None                             None   \n",
+       "2                               None  налог на добавленную стоимость    \n",
+       "3   в том числе в федеральный бюджет                             None   \n",
+       "4                                  4                                5   \n",
+       "..                               ...                              ...   \n",
+       "95                              9384                           989177   \n",
+       "96                             13819                           249470   \n",
+       "97                             72443                          1110375   \n",
+       "98                              1406                            58193   \n",
+       "99                              1076                            25804   \n",
        "\n",
-       "    кол. налогоплательщиков      всего  по налогу (сбору)   по пени  \\\n",
-       "0                     23632   30705385           18388429   4699527   \n",
-       "1                      9109    3013574            1661920    881213   \n",
-       "2                     39920     677495             115346     44047   \n",
-       "3                     58635    2161183             773472    556271   \n",
-       "4                        93      43115              31946       752   \n",
-       "5                     25104    4038755            2487632   1003525   \n",
-       "6                  16841363   60557919           41458668  19050086   \n",
-       "7                   1799585  133435115           36498879  19197715   \n",
-       "8                     40157   71402906           40477282  12510111   \n",
-       "9                     19496    4481612            2347431   1261980   \n",
-       "10                   107516    1397855             318751     93790   \n",
-       "11                   152898    5175662            1879757   1448942   \n",
-       "12                      165      79701              64933      1515   \n",
-       "13                    40759    7379705            4476555   1843784   \n",
-       "14                 16980357   61711637           42232603  19477495   \n",
-       "15                  3737334  340440384           62488024  31036806   \n",
+       "                                           по штрафам  \\\n",
+       "0                                                None   \n",
+       "1                                                None   \n",
+       "2   из графы 5 налог на добавленную стоимость по т...   \n",
+       "3                                                None   \n",
+       "4                                                   6   \n",
+       "..                                                ...   \n",
+       "95                                             989065   \n",
+       "96                                             249469   \n",
+       "97                                            1110306   \n",
+       "98                                              58037   \n",
+       "99                                              25804   \n",
        "\n",
-       "    по штрафам проценты   по ЕСН  по страховым взносам  \n",
-       "0      1439452   184237   596724               5397016  \n",
-       "1       242978      628    51928                174907  \n",
-       "2        11984        0     5203                500915  \n",
-       "3        70216     5769   154600                600855  \n",
-       "4           76        0     5527                  4814  \n",
-       "5       215253        х    29516                302829  \n",
-       "6          325        1        0                 48839  \n",
-       "7      5102099    12731  2055653              70568038  \n",
-       "8      3381043   838205  1228143              12968122  \n",
-       "9       345253      628    72728                453592  \n",
-       "10       17866        0     8821                958627  \n",
-       "11      185201     6911   255959               1398892  \n",
-       "12          70        0     5565                  7618  \n",
-       "13      437216        х    73047                549103  \n",
-       "14         676      725        0                   138  \n",
-       "15     7336091    34915  2494914             237049634  "
+       "                                        проценты  \\\n",
+       "0                                           None   \n",
+       "1                                           None   \n",
+       "2   платежи за пользование природными ресурсами    \n",
+       "3                                           None   \n",
+       "4                                              7   \n",
+       "..                                           ...   \n",
+       "95                                         16638   \n",
+       "96                                         31627   \n",
+       "97                                         20647   \n",
+       "98                                          2554   \n",
+       "99                                          1336   \n",
+       "\n",
+       "                                              по ЕСН  \\\n",
+       "0                                               None   \n",
+       "1                                               None   \n",
+       "2   из графы 7\\n налог на добычу полезных ископаемых   \n",
+       "3                                               None   \n",
+       "4                                                  8   \n",
+       "..                                               ...   \n",
+       "95                                               877   \n",
+       "96                                             29256   \n",
+       "97                                              4214   \n",
+       "98                                                59   \n",
+       "99                                               857   \n",
+       "\n",
+       "                    по страховым взносам  \n",
+       "0                                   None  \n",
+       "1                                   None  \n",
+       "2   остальные федеральные налоги и сборы  \n",
+       "3                                   None  \n",
+       "4                                      9  \n",
+       "..                                   ...  \n",
+       "95                                191247  \n",
+       "96                                128853  \n",
+       "97                                502243  \n",
+       "98                                 44941  \n",
+       "99                                  6632  \n",
+       "\n",
+       "[100 rows x 14 columns]"
       ]
      },
      "execution_count": 14,
@@ -1203,15 +1598,9 @@      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# пример как полностью настроить класс сразу при создании\n",
-    "proc = ExcelPreprocessor('data\\\\2009\\\\4nm010618.xlsx', 'data\\\\2008\\\\4nm011118.xlsx',\n",
-    "    filename_parser=lambda filename: re.match(r'data\\\\(?P<year>\\w+)\\\\(?P<filename>\\w+)\\.(?P<extension>\\w+)', filename).groupdict(),\n",
-    "    append_global_index=True, append_wb_index=True,\n",
-    "    cells = {'form_name': 'J1', 'description': 'A2', 'form_dt': 'A3', 'unit_name': 'J4'},\n",
-    "    headers = ['поле', 'код строки', 'кол. налогоплательщиков', 'всего', 'по налогу (сбору)', 'по пени', 'по штрафам', 'проценты', 'по ЕСН', 'по страховым взносам'],\n",
-    "    first_row_number=7, append_ws_title_column=True, iterate_over_worksheet=2\n",
-    ")\n",
-    "proc.get_dataframe()"
+    "# видим много пустых строк (без значений). Отфильтруем только те где есть хотя бы одно значение\n",
+    "proc.filter_empty_cells = True\n",
+    "proc.get_sample_dataframe(0, 200)"
    ]
   }
  ],