add table.py

2025-08-30 23:30:59 +03:00
commit 3b3f7097df
2 changed files with 605 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pdf
+*.html
--- a/table.py
+++ b/table.py
@@ -0,0 +1,603 @@
+import pdfplumber
+import re
+import html
+
+def is_empty_row(row):
+    """Return True when the row has no cells or every cell is blank.
+
+    A cell counts as blank when it is None or when its string form
+    contains nothing but whitespace.
+    """
+    if not row:
+        return True
+    return all(cell is None or not str(cell).strip() for cell in row)
+
+def remove_empty_rows(table):
+    """Return a new list with the completely blank rows filtered out.
+
+    A falsy table (None or an empty list) yields an empty list.
+    """
+    kept_rows = []
+    if not table:
+        return kept_rows
+    for candidate in table:
+        if is_empty_row(candidate):
+            continue
+        kept_rows.append(candidate)
+    return kept_rows
+
+def should_merge_row(row):
+    """Tell whether the row should become one horizontally merged cell.
+
+    That is the case when the first cell holds text and every remaining
+    cell is None or whitespace-only. An empty row is never merged.
+    """
+    if not row:
+        return False
+
+    def text_of(cell):
+        return "" if cell is None else str(cell).strip()
+
+    if not text_of(row[0]):
+        return False
+    return not any(text_of(cell) for cell in row[1:])
+
+def is_ogrn_continuation(cell_value):
+    """Detect a cell that carries the continuation of an OGRN entry.
+
+    Returns True when the stripped string form of the value starts with
+    'ОГРН:' or 'ОГРН :'. Falsy values (None, empty string) return False.
+    """
+    if not cell_value:
+        return False
+    return str(cell_value).strip().startswith(('ОГРН:', 'ОГРН :'))
+
+def get_last_non_empty_value(table, column_index):
+    """Return the last non-blank value of a column, scanning bottom-up.
+
+    Args:
+        table: Sequence of rows (each a sequence of cells); may be falsy.
+        column_index: Column to inspect. Negative values count from the
+            end of each row, so -1 addresses the last cell of the row.
+
+    Returns:
+        The stripped string form of the first non-blank cell met while
+        walking the rows from the bottom, or None when no row has a
+        non-blank cell in that column.
+    """
+    if not table:
+        return None
+
+    for row in reversed(table):
+        # Check both bounds: a bare `column_index < len(row)` lets a negative
+        # index through for rows that are too short (e.g. -1 on an empty
+        # row), which used to raise IndexError.
+        if not row or not -len(row) <= column_index < len(row):
+            continue
+        cell = row[column_index]
+        text = "" if cell is None else str(cell).strip()
+        if text:
+            return text
+    return None
+
+def get_first_non_empty_value(table, column_index):
+    """Return the first non-blank value of a column, scanning top-down.
+
+    Args:
+        table: Sequence of rows (each a sequence of cells); may be falsy.
+        column_index: Column to inspect. Negative values count from the
+            end of each row, so -1 addresses the last cell of the row.
+
+    Returns:
+        The stripped string form of the first non-blank cell met while
+        walking the rows from the top, or None when no row has a
+        non-blank cell in that column.
+    """
+    if not table:
+        return None
+
+    for row in table:
+        # Check both bounds: a bare `column_index < len(row)` lets a negative
+        # index through for rows that are too short (e.g. -1 on an empty
+        # row), which used to raise IndexError.
+        if not row or not -len(row) <= column_index < len(row):
+            continue
+        cell = row[column_index]
+        text = "" if cell is None else str(cell).strip()
+        if text:
+            return text
+    return None
+
+def is_pagination_continuation(prev_table, new_table):
+    """Detect a value that was cut by a page break and continues in new_table.
+
+    The tables are reported as a pagination continuation when all of the
+    following hold:
+      * both tables are non-empty;
+      * the last non-blank first-column value of prev_table and the first
+        non-blank first-column value of new_table both exist and differ;
+      * the last column of new_table has no non-blank value in any row.
+
+    Args:
+        prev_table: Rows collected so far.
+        new_table: Rows of the table that follows.
+
+    Returns:
+        True when new_table looks like the spilled-over tail of a value
+        from prev_table, otherwise False.
+    """
+    if not prev_table or not new_table:
+        return False
+
+    prev_first_col = get_last_non_empty_value(prev_table, 0)
+    new_first_col = get_first_non_empty_value(new_table, 0)
+    if not prev_first_col or not new_first_col:
+        return False
+    if prev_first_col == new_first_col:
+        return False
+
+    # The lookup scans every row, so a falsy result means the whole last
+    # column of the new table is blank.
+    return not get_first_non_empty_value(new_table, -1)
+
+def is_continuation_table(prev_table, new_table):
+    """Decide whether new_table continues prev_table.
+
+    Three signals are checked in order:
+      1. the first cell of new_table is an OGRN continuation;
+      2. the pair qualifies as a pagination continuation;
+      3. the boundary values match: the last non-blank value of the first
+         and of the last column of prev_table equal the first non-blank
+         value of the same columns in new_table.
+
+    Returns False when either table is empty.
+    """
+    if not (prev_table and new_table):
+        return False
+
+    leading_row = new_table[0]
+    if leading_row and is_ogrn_continuation(leading_row[0]):
+        return True
+
+    if is_pagination_continuation(prev_table, new_table):
+        return True
+
+    # Boundary values: bottom of the previous table versus top of the new one.
+    prev_first = get_last_non_empty_value(prev_table, 0)
+    prev_last = get_last_non_empty_value(prev_table, -1)
+    new_first = get_first_non_empty_value(new_table, 0)
+    new_last = get_first_non_empty_value(new_table, -1)
+
+    first_matches = prev_first and new_first and prev_first == new_first
+    last_matches = prev_last and new_last and prev_last == new_last
+    return bool(first_matches and last_matches)
+
+def merge_pagination_continuation(prev_table, new_table):
+    """Glue a first-column value split by a page break back together.
+
+    The last non-blank first-column value of prev_table is joined, with a
+    single space, to the first non-blank first-column value of new_table.
+    The joined text replaces the first cell of the row that held the
+    prev_table value; the rows of new_table are then appended.
+
+    The row of new_table that supplied the continuation is dropped when
+    its other cells are blank. When it carries other data it is kept with
+    an empty first cell so that nothing is lost.
+
+    Args:
+        prev_table: Rows collected so far (list of row lists).
+        new_table: Rows of the table that continues prev_table.
+
+    Returns:
+        prev_table itself when either table is empty or prev_table has no
+        first-column text; otherwise a new list. The caller's lists are
+        not modified.
+    """
+    if not prev_table or not new_table:
+        return prev_table
+
+    def text_of(cell):
+        return "" if cell is None else str(cell).strip()
+
+    def first_col_text(row):
+        return text_of(row[0]) if row else ""
+
+    # Row of prev_table that actually holds the value being continued. It is
+    # not necessarily the last non-empty row: that row may have a blank
+    # first cell.
+    target_index = next(
+        (i for i in range(len(prev_table) - 1, -1, -1)
+         if first_col_text(prev_table[i])),
+        None,
+    )
+    if target_index is None:
+        return prev_table
+
+    # Row of new_table that supplies the continuation text.
+    source_index = next(
+        (i for i, row in enumerate(new_table) if first_col_text(row)),
+        None,
+    )
+    if source_index is None:
+        return prev_table + new_table
+
+    target_row = prev_table[target_index]
+    source_row = new_table[source_index]
+    merged_value = f"{first_col_text(target_row)} {first_col_text(source_row)}"
+
+    # Shallow copy so the caller's table is left untouched.
+    merged_table = list(prev_table)
+    merged_table[target_index] = [merged_value] + list(target_row[1:])
+
+    for index, row in enumerate(new_table):
+        if index != source_index:
+            merged_table.append(row)
+        elif any(text_of(cell) for cell in row[1:]):
+            # The first cell was consumed by the merge; keep the rest.
+            merged_table.append([""] + list(row[1:]))
+
+    return merged_table
+
+def _append_ogrn_continuation(current_table, continuation_rows):
+    """Attach an OGRN continuation to current_table, modifying it in place.
+
+    The first cell of continuation_rows[0] is appended (after a space) to
+    the first cell of the last non-empty row of current_table, and the
+    remaining continuation rows are added below.
+    """
+    last_row_index = next(
+        (j for j in range(len(current_table) - 1, -1, -1)
+         if not is_empty_row(current_table[j])),
+        -1,
+    )
+    last_row = current_table[last_row_index] if last_row_index >= 0 else None
+    last_row_value = ""
+    if last_row and last_row[0] is not None:
+        last_row_value = str(last_row[0]).strip()
+
+    if not last_row_value:
+        # There is no text to glue the OGRN onto. Keep the continuation row
+        # as it is instead of silently dropping the OGRN text.
+        current_table.extend(continuation_rows)
+        return
+
+    ogrn_value = str(continuation_rows[0][0]).strip()
+    merged_value = f"{last_row_value} {ogrn_value}"
+    current_table[last_row_index] = [merged_value] + list(last_row[1:])
+    current_table.extend(continuation_rows[1:])
+
+
+def _count_repeated_leading_rows(current_table, new_rows):
+    """Count leading rows of new_rows that repeat the tail of current_table.
+
+    A row counts as repeated when both its first and last cells are
+    non-blank and equal the last non-blank first-column and last-column
+    values of current_table. Counting stops at the first other row.
+    """
+    # Neither lookup depends on the row being inspected, so do them once.
+    last_current_first = get_last_non_empty_value(current_table, 0)
+    last_current_last = get_last_non_empty_value(current_table, -1)
+
+    repeated = 0
+    for new_row in new_rows:
+        new_first_val = str(new_row[0]).strip() if new_row and new_row[0] is not None else ""
+        new_last_val = str(new_row[-1]).strip() if new_row and new_row[-1] is not None else ""
+        if not (new_first_val and new_last_val):
+            break
+        if new_first_val != last_current_first or new_last_val != last_current_last:
+            break
+        repeated += 1
+    return repeated
+
+
+def merge_ogrn_continuation_tables(all_tables):
+    """Merge consecutive tables that belong together into single tables.
+
+    Tables are processed in order after their blank rows are removed. A
+    table is folded into the one being accumulated when:
+      * its first cell is an OGRN continuation: that text is appended to
+        the first cell of the last accumulated row;
+      * it is a pagination continuation: handled by
+        merge_pagination_continuation;
+      * its boundary values match the accumulated table: rows repeating
+        the accumulated tail are skipped and the rest appended.
+    Any other table starts a new output table. Progress messages are
+    printed for every merge.
+
+    Args:
+        all_tables: List of tables, each a list of rows (lists of cells).
+
+    Returns:
+        List of merged tables; empty when all_tables is empty or contains
+        only blank tables.
+    """
+    if not all_tables:
+        return []
+
+    merged_tables = []
+    current_table = None
+
+    for i, table in enumerate(all_tables):
+        cleaned_table = remove_empty_rows(table)
+        if not cleaned_table:
+            continue
+
+        if current_table is None:
+            current_table = cleaned_table
+            continue
+
+        first_row = cleaned_table[0]
+        first_cell = first_row[0] if first_row else ""
+
+        if is_ogrn_continuation(first_cell):
+            print(f"Таблица {i+1} содержит продолжение ОГРН - объединяем")
+            _append_ogrn_continuation(current_table, cleaned_table)
+        elif is_continuation_table(current_table, cleaned_table):
+            print(f"Таблица {i+1} является продолжением - объединяем")
+            if is_pagination_continuation(current_table, cleaned_table):
+                current_table = merge_pagination_continuation(current_table, cleaned_table)
+            else:
+                start_index = _count_repeated_leading_rows(current_table, cleaned_table)
+                current_table.extend(cleaned_table[start_index:])
+        else:
+            # Unrelated table: close the accumulated one and start afresh.
+            merged_tables.append(current_table)
+            current_table = cleaned_table
+
+    if current_table is not None:
+        merged_tables.append(current_table)
+
+    return merged_tables
+
+def is_special_value(value):
+    """Recognise special marker values such as '-(-)', '-(1)', '-' or '+5'.
+
+    These values must not wrap or narrow their columns. Falsy input
+    returns False.
+    """
+    if not value:
+        return False
+
+    special_patterns = (
+        r'^\-\(.*\)$',      # -(something)
+        r'^\-\.$',          # -.
+        r'^\-$',            # -
+        r'^\.$',            # .
+        r'^\(.*\)$',        # (something)
+        r'^[\-\+]\d+$',     # -123, +456
+        r'^\d+[\-\+]$',     # 123-, 456+
+    )
+    return any(re.match(pattern, value) for pattern in special_patterns)
+
+def is_header_row(row):
+    """Tell whether the row is a table header.
+
+    A row is a header when it qualifies for horizontal merging (only its
+    first cell is filled), or when its first cell contains, in any letter
+    case, one of the keywords таблица/table, раздел/section, глава/chapter
+    or часть/part followed by whitespace and a number.
+    """
+    if not row:
+        return False
+
+    if should_merge_row(row):
+        return True
+
+    leading_text = "" if row[0] is None else str(row[0]).strip()
+    header_patterns = (
+        r'таблица\s+\d+', r'table\s+\d+', r'раздел\s+\d+', r'section\s+\d+',
+        r'глава\s+\d+', r'chapter\s+\d+', r'часть\s+\d+', r'part\s+\d+'
+    )
+    return any(
+        re.search(pattern, leading_text, re.IGNORECASE)
+        for pattern in header_patterns
+    )
+
+def split_tables_by_headers(merged_tables):
+    """Regroup the rows of all tables so each header row opens a new table.
+
+    Rows are read in order across every input table. A header row closes
+    the group collected so far (when it is non-empty) and becomes the
+    first row of the next group. Boundaries between the input tables do
+    not start a new group by themselves.
+    """
+    if not merged_tables:
+        return []
+
+    separated_tables = []
+    pending_rows = []
+
+    for table in merged_tables:
+        for row in table:
+            if is_header_row(row) and pending_rows:
+                separated_tables.append(pending_rows)
+                pending_rows = []
+            # Header or not, the row belongs to the group currently open.
+            pending_rows.append(row)
+
+    if pending_rows:
+        separated_tables.append(pending_rows)
+
+    return separated_tables
+
+def process_table_for_merging(table):
+    """Annotate every cell of a table with the layout data used for rendering.
+
+    Blank rows are removed first. The column count is taken from the first
+    remaining row. Each row becomes a list of cell dictionaries:
+
+      * a row with only its first cell filled becomes one 'horizontal'
+        cell spanning all columns, followed by 'hidden' placeholder cells
+        (colspan 0, rowspan 0);
+      * every other row yields one 'normal' or 'empty' cell per source
+        cell.
+
+    Cell dictionaries carry the keys content, colspan, rowspan, type,
+    is_last_row, is_special and is_header; regular cells also carry
+    is_last_col. is_header is set from is_header_row for visible cells,
+    so rows recognised as headers by keyword are flagged as headers too.
+
+    Args:
+        table: List of rows (lists of cells); may be falsy.
+
+    Returns:
+        List of processed rows, or an empty list when nothing remains.
+    """
+    if not table:
+        return []
+
+    cleaned_table = remove_empty_rows(table)
+    if not cleaned_table:
+        return []
+
+    col_count = len(cleaned_table[0])
+    last_row_index = len(cleaned_table) - 1
+    processed_table = []
+
+    for row_index, row in enumerate(cleaned_table):
+        is_last_row = row_index == last_row_index
+        is_header = is_header_row(row)
+
+        if should_merge_row(row):
+            # One visible cell spanning the whole width of the table.
+            processed_row = [{
+                'content': row[0],
+                'colspan': col_count,
+                'rowspan': 1,
+                'type': 'horizontal',
+                'is_last_row': is_last_row,
+                'is_special': is_special_value(str(row[0]).strip() if row[0] is not None else ""),
+                'is_header': is_header
+            }]
+            # Placeholders for the columns covered by the span.
+            processed_row.extend(
+                {
+                    'content': '',
+                    'colspan': 0,
+                    'rowspan': 0,
+                    'type': 'hidden',
+                    'is_last_row': is_last_row and (i == col_count - 1),
+                    'is_special': False,
+                    'is_header': False
+                }
+                for i in range(1, col_count)
+            )
+        else:
+            processed_row = []
+            for col_index, cell in enumerate(row):
+                cell_content = str(cell).strip() if cell is not None else ""
+                processed_row.append({
+                    'content': cell,
+                    'colspan': 1,
+                    'rowspan': 1,
+                    'type': 'normal' if cell_content else 'empty',
+                    'is_last_row': is_last_row,
+                    'is_last_col': (col_index == col_count - 1),
+                    'is_special': is_special_value(cell_content),
+                    # Previously hard-coded to False, which left rows that
+                    # is_header_row recognises by keyword without header
+                    # styling.
+                    'is_header': is_header
+                })
+
+        processed_table.append(processed_row)
+
+    return processed_table
+
+def table_to_html(processed_table):
+    """Render a processed table as an HTML <table> string.
+
+    Args:
+        processed_table: List of rows, each a list of cell dictionaries
+            with the keys content, colspan, rowspan and type, plus the
+            optional flags is_last_row, is_special and is_header.
+
+    Returns:
+        The HTML markup, or an empty string for an empty table. Cell
+        content is HTML-escaped. Cells with colspan 0 and rowspan 0 are
+        not rendered.
+    """
+    if not processed_table:
+        return ""
+
+    # Collect the fragments and join once instead of growing a string.
+    parts = ['<table class="pdf-table" cellpadding="5" cellspacing="0">\n']
+
+    for row in processed_table:
+        parts.append("  <tr>\n")
+
+        for cell_info in row:
+            content = cell_info['content']
+            colspan = cell_info['colspan']
+            rowspan = cell_info['rowspan']
+            cell_type = cell_info['type']
+            is_last_row = cell_info.get('is_last_row', False)
+            is_special = cell_info.get('is_special', False)
+            is_header = cell_info.get('is_header', False)
+
+            # Placeholder cells covered by a merged cell produce no markup.
+            if colspan == 0 and rowspan == 0:
+                continue
+
+            cell_text = html.escape(str(content) if content is not None else "")
+
+            classes = ["pdf-table-cell"]
+            if cell_type == 'horizontal' or is_header:
+                classes.extend(["pdf-table-header", "pdf-table-header-horizontal"])
+            elif cell_type == 'empty':
+                classes.append("pdf-table-cell-empty")
+            if is_special:
+                classes.append("pdf-table-cell-special")
+
+            if cell_type in ['normal', 'horizontal'] or is_header:
+                border_styles = ["border-top: 1px solid #ddd", "border-bottom: none"]
+            else:
+                border_styles = ["border: none"]
+            border_styles.append("border-left: 1px solid #ddd")
+            border_styles.append("border-right: 1px solid #ddd")
+            if is_last_row:
+                # Declared last so it overrides the earlier border-bottom.
+                border_styles.append("border-bottom: 1px solid #ddd")
+
+            colspan_attr = f' colspan="{colspan}"' if colspan > 1 else ''
+            rowspan_attr = f' rowspan="{rowspan}"' if rowspan > 1 else ''
+            class_attr = f' class="{" ".join(classes)}"'
+            style_attr = f' style="{"; ".join(border_styles)}"'
+
+            parts.append(
+                f'    <td{colspan_attr}{rowspan_attr}{class_attr}{style_attr}>{cell_text}</td>\n'
+            )
+
+        parts.append("  </tr>\n")
+
+    parts.append("</table>")
+    return "".join(parts)
+
+def extract_tables_from_pdf(pdf_path, start_page, end_page):
+    """Extract the tables of a page range from a PDF and render them as HTML.
+
+    The tables found on every page of the range are stripped of blank
+    rows, merged across pages (merge_ogrn_continuation_tables), regrouped
+    by header rows (split_tables_by_headers) and converted to HTML.
+    Progress messages are printed along the way.
+
+    Args:
+        pdf_path: Path of the PDF file, passed to pdfplumber.open.
+        start_page: Zero-based index of the first page to process.
+            Negative values are treated as 0.
+        end_page: Zero-based index of the last page to process
+            (inclusive). Values past the end of the document are reduced
+            to the last page.
+
+    Returns:
+        The HTML of all tables joined with newlines, or an empty string
+        when no table was found.
+    """
+    all_page_tables = []
+
+    with pdfplumber.open(pdf_path) as pdf:
+        # Clamp the range to existing pages. Without the lower bound a
+        # negative start_page would index pdf.pages from the end.
+        start_page = max(start_page, 0)
+        end_page = min(end_page, len(pdf.pages) - 1)
+
+        print(f"Обрабатываются страницы с {start_page + 1} по {end_page + 1}")
+
+        for page_num in range(start_page, end_page + 1):
+            tables = pdf.pages[page_num].extract_tables()
+
+            if not tables:
+                print(f"На странице {page_num + 1} таблиц не найдено")
+                continue
+
+            print(f"На странице {page_num + 1} найдено {len(tables)} таблиц")
+            for table in tables:
+                cleaned_table = remove_empty_rows(table)
+                if cleaned_table:
+                    all_page_tables.append(cleaned_table)
+
+    if not all_page_tables:
+        print("Не найдено ни одной таблицы для обработки")
+        return ""
+
+    # Join tables split across pages, then cut them again at header rows.
+    merged_tables = merge_ogrn_continuation_tables(all_page_tables)
+    separated_tables = split_tables_by_headers(merged_tables)
+
+    print(f"После объединения и разделения по заголовкам получено {len(separated_tables)} таблиц")
+
+    html_tables = []
+    for table in separated_tables:
+        processed_table = process_table_for_merging(table)
+        if processed_table:
+            html_tables.append(table_to_html(processed_table))
+
+    return '\n'.join(html_tables)
+
+def save_tables_to_html(tables_html, output_html_path, full_html=True, title="Таблицы из PDF"):
+    """Write the HTML of the tables to a UTF-8 encoded file.
+
+    Args:
+        tables_html: Ready-made HTML markup of the tables; written as is.
+        output_html_path: Path of the file to create or overwrite.
+        full_html: When True, the markup is wrapped in a complete HTML
+            document with embedded CSS; when False, only tables_html is
+            written.
+        title: Plain-text page title, used for <title> and <h1>. It is
+            HTML-escaped before insertion.
+    """
+    if full_html:
+        # The title is plain text: escape it so characters such as '&' or
+        # '<' cannot break the generated document.
+        safe_title = html.escape(title)
+        full_html_content = f"""
+        <!DOCTYPE html>
+        <html lang="ru">
+        <head>
+            <meta charset="UTF-8">
+            <title>{safe_title}</title>
+            <style>
+                .pdf-table {{
+                    border-collapse: collapse;
+                    width: 100%;
+                    margin-bottom: 30px;
+                    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+                    background-color: white;
+                    border: 1px solid #ddd;
+                    table-layout: auto;
+                }}
+                .pdf-table-cell {{
+                    padding: 10px;
+                    text-align: left;
+                    background-color: white;
+                    word-wrap: break-word;
+                }}
+                .pdf-table-header {{
+                    font-weight: bold;
+                    background-color: #f8f9fa;
+                }}
+                .pdf-table-header-horizontal {{
+                    text-align: center;
+                    font-size: 1.1em;
+                    background-color: #e8f4f8;
+                }}
+                .pdf-table-cell-empty {{
+                    background-color: white;
+                }}
+                /* Стиль для специальных значений - запрет переноса */
+                .pdf-table-cell-special {{
+                    white-space: nowrap;
+                    min-width: 30px;
+                }}
+                .pdf-table tr:hover .pdf-table-cell {{
+                    background-color: #f5f5f5;
+                }}
+                .pdf-table tr:hover .pdf-table-header {{
+                    background-color: #e0e0e0;
+                }}
+            </style>
+        </head>
+        <body>
+            <h1>{safe_title}</h1>
+            {tables_html}
+        </body>
+        </html>
+        """
+    else:
+        full_html_content = tables_html
+
+    with open(output_html_path, "w", encoding="utf-8") as f:
+        f.write(full_html_content)
+    print(f"HTML файл сохранен как: {output_html_path}")
+
+def extract_tables_to_html(pdf_path, start_page, end_page, output_html_path, full_html=True):
+    """Extract the tables of a PDF page range and save them as an HTML file.
+
+    Nothing is written when no tables are found; a message is printed
+    instead.
+    """
+    tables_html = extract_tables_from_pdf(pdf_path, start_page, end_page)
+    if not tables_html:
+        print("Нечего сохранять - таблицы не найдены")
+        return
+    save_tables_to_html(tables_html, output_html_path, full_html)
+
+# Example usage: convert zero-based pages 5..500 of 1.pdf into 1.html.
+if __name__ == "__main__":
+    extract_tables_to_html(
+        pdf_path="1.pdf",
+        start_page=5,
+        end_page=500,
+        output_html_path="1.html",
+    )