# table_pdf/table.py

import pdfplumber
import re
import html

def is_empty_row(row):
    """Return True when the row has no cells or every cell is blank.

    A cell counts as blank when it is None or when its string form is empty
    after stripping surrounding whitespace.
    """
    if not row:
        return True
    return not any(
        cell is not None and str(cell).strip()
        for cell in row
    )

def remove_empty_rows(table):
    """Return a new list holding only the rows that are not entirely blank.

    A missing or empty table yields an empty list. Row blankness is decided
    by is_empty_row.
    """
    return [
        candidate
        for candidate in (table or ())
        if not is_empty_row(candidate)
    ]

def should_merge_row(row):
    """Tell whether a row should be rendered as one cell spanning all columns.

    This is the case when the first cell holds text and every remaining cell
    is None or blank after stripping whitespace.
    """
    if not row:
        return False

    def text_of(cell):
        return "" if cell is None else str(cell).strip()

    head, tail = row[0], row[1:]
    if not text_of(head):
        return False
    return not any(text_of(cell) for cell in tail)

def is_ogrn_continuation(cell_value):
    """Tell whether a cell value is the continuation of an OGRN record.

    Returns True when the stripped string form of the value starts with
    'ОГРН:' or 'ОГРН :'; a falsy value is never a continuation.
    """
    if cell_value:
        return str(cell_value).strip().startswith(('ОГРН:', 'ОГРН :'))
    return False

def get_last_non_empty_value(table, column_index):
    """Return the last non-blank value of a column, scanning bottom-up.

    Args:
        table: Sequence of rows (each a sequence of cells); may be empty or
            None.
        column_index: Column to inspect. Negative values count from the end
            of each row, so -1 means "the last cell of every row".

    Returns:
        The stripped string form of the first non-blank cell met when walking
        the rows from the bottom, or None when the table is empty or no row
        has text in that column. Rows too short to contain the column are
        skipped.
    """
    if not table:
        return None

    for row in reversed(table):
        # The index must be valid for this row in both directions: a bare
        # `column_index < len(row)` lets negative indices through and would
        # raise IndexError on rows shorter than abs(column_index).
        if not -len(row) <= column_index < len(row):
            continue
        cell = row[column_index]
        if cell is None:
            continue
        text = str(cell).strip()
        if text:
            return text
    return None

def get_first_non_empty_value(table, column_index):
    """Return the first non-blank value of a column, scanning top-down.

    Args:
        table: Sequence of rows (each a sequence of cells); may be empty or
            None.
        column_index: Column to inspect. Negative values count from the end
            of each row, so -1 means "the last cell of every row".

    Returns:
        The stripped string form of the first non-blank cell met when walking
        the rows from the top, or None when the table is empty or no row has
        text in that column. Rows too short to contain the column are
        skipped.
    """
    if not table:
        return None

    for row in table:
        # The index must be valid for this row in both directions: a bare
        # `column_index < len(row)` lets negative indices through and would
        # raise IndexError on rows shorter than abs(column_index).
        if not -len(row) <= column_index < len(row):
            continue
        cell = row[column_index]
        if cell is None:
            continue
        text = str(cell).strip()
        if text:
            return text
    return None

def is_pagination_continuation(prev_table, new_table):
    """Tell whether new_table continues prev_table across a page break.

    The tables are treated as one when all of the following hold:
      * both tables are non-empty;
      * the last non-blank first-column value of prev_table and the first
        non-blank first-column value of new_table both exist and differ;
      * the last column of new_table contains no non-blank value.

    This models a first-column value whose text spilled onto the next page.

    Args:
        prev_table: Rows of the table collected so far.
        new_table: Rows of the candidate continuation table.

    Returns:
        True when the conditions above are met, False otherwise.
    """
    if not prev_table or not new_table:
        return False

    prev_first_col = get_last_non_empty_value(prev_table, 0)
    new_first_col = get_first_non_empty_value(new_table, 0)
    if not prev_first_col or not new_first_col:
        return False
    if prev_first_col == new_first_col:
        return False

    # Only the new table's last column matters for the decision; the previous
    # table's last column is not part of the rule, so it is not scanned.
    return not get_first_non_empty_value(new_table, -1)

def is_continuation_table(prev_table, new_table):
    """Tell whether new_table should be appended to prev_table.

    Three checks are applied in order, the first match wins:
      1. the first cell of new_table's first row is an OGRN continuation;
      2. the pair satisfies is_pagination_continuation;
      3. the last non-blank values of the first and last columns of
         prev_table equal the first non-blank values of the same columns of
         new_table (all four values must be present).
    """
    if not (prev_table and new_table):
        return False

    leading_row = new_table[0]
    if leading_row and is_ogrn_continuation(leading_row[0]):
        return True

    if is_pagination_continuation(prev_table, new_table):
        return True

    # Boundary values on both sides of the seam between the two tables.
    tail_first = get_last_non_empty_value(prev_table, 0)
    tail_last = get_last_non_empty_value(prev_table, -1)
    head_first = get_first_non_empty_value(new_table, 0)
    head_last = get_first_non_empty_value(new_table, -1)

    first_matches = bool(tail_first and head_first) and tail_first == head_first
    last_matches = bool(tail_last and head_last) and tail_last == head_last
    return first_matches and last_matches

def _first_cell_text(row):
    """Return the stripped text of a row's first cell, or "" if absent/None."""
    if not row or row[0] is None:
        return ""
    return str(row[0]).strip()

def merge_pagination_continuation(prev_table, new_table):
    """Join a first-column value that was split across a page break.

    The first non-blank first-column value of new_table is appended (after a
    single space) to the last non-blank first-column value of prev_table, and
    the remaining rows of new_table are appended to the result.

    The merged text is written into the row that actually holds the value
    being continued. Writing it into the last non-empty row instead would
    duplicate the value whenever that row's first cell is blank.

    Args:
        prev_table: Rows collected so far (list of rows). Not modified.
        new_table: Rows of the continuation table (list of rows).

    Returns:
        prev_table itself when either table is empty or prev_table has no
        text in its first column; prev_table + new_table when new_table has
        no text in its first column; otherwise a new list with the merged
        row followed by new_table[1:].
    """
    if not prev_table or not new_table:
        return prev_table

    # Index of the bottom-most row whose first cell carries text.
    target_index = next(
        (
            index
            for index in range(len(prev_table) - 1, -1, -1)
            if _first_cell_text(prev_table[index])
        ),
        None,
    )
    if target_index is None:
        return prev_table

    continuation = next(
        (text for text in map(_first_cell_text, new_table) if text),
        None,
    )
    if continuation is None:
        return prev_table + new_table

    # Work on a shallow copy so the caller's list is left untouched; only the
    # target row is replaced by a freshly built list.
    merged_table = list(prev_table)
    target_row = merged_table[target_index]
    merged_value = f"{_first_cell_text(target_row)} {continuation}"
    merged_table[target_index] = [merged_value] + list(target_row[1:])

    # NOTE(review): the first row of new_table is dropped as a whole, so any
    # text in its other columns is discarded - confirm this is intended.
    return merged_table + new_table[1:]

def _append_to_last_first_cell(table, text):
    """Append text to the bottom-most non-blank first-column cell, in place.

    Returns True when such a cell was found and updated, False otherwise.
    """
    for index in range(len(table) - 1, -1, -1):
        row = table[index]
        if not row or row[0] is None:
            continue
        current = str(row[0]).strip()
        if current:
            table[index] = [f"{current} {text}"] + list(row[1:])
            return True
    return False

def _count_leading_duplicates(current_table, new_rows):
    """Count leading rows of new_rows that repeat current_table's tail values.

    A row is a duplicate when both its first and last cells are non-blank and
    equal the last non-blank values of the first and last columns of
    current_table. Counting stops at the first row that is not a duplicate.
    """
    # current_table does not change while scanning, so look these up once.
    tail_first = get_last_non_empty_value(current_table, 0)
    tail_last = get_last_non_empty_value(current_table, -1)

    skipped = 0
    for row in new_rows:
        first_text = str(row[0]).strip() if row and row[0] is not None else ""
        last_text = str(row[-1]).strip() if row and row[-1] is not None else ""
        if not (first_text and last_text):
            break
        if first_text != tail_first or last_text != tail_last:
            break
        skipped += 1
    return skipped

def merge_ogrn_continuation_tables(all_tables):
    """Merge consecutive tables that are continuations of one another.

    Each table is first stripped of blank rows; tables that end up empty are
    ignored. Every following table is then handled in one of three ways:

      * Its first cell is an OGRN continuation: that text is appended to the
        bottom-most non-blank first-column cell of the current table and the
        remaining rows are appended. If the current table has no text in its
        first column, the whole table (including the OGRN row) is appended so
        the text is not lost.
      * is_continuation_table accepts it: it is merged either through
        merge_pagination_continuation or by appending its rows after
        skipping leading rows that repeat the current table's tail values.
      * Otherwise the current table is finished and a new one is started.

    A progress message is printed for every merge.

    Args:
        all_tables: Sequence of tables (each a list of rows); may be empty or
            None. The input tables themselves are not modified.

    Returns:
        A list of merged tables.
    """
    if not all_tables:
        return []

    merged_tables = []
    current_table = None

    for number, table in enumerate(all_tables, start=1):
        if not table:
            continue

        cleaned_table = remove_empty_rows(table)
        if not cleaned_table:
            continue

        if current_table is None:
            current_table = cleaned_table
            continue

        first_row = cleaned_table[0]
        first_cell = first_row[0] if first_row else ""

        if is_ogrn_continuation(first_cell):
            print(f"Таблица {number} содержит продолжение ОГРН - объединяем")
            ogrn_value = str(first_cell).strip()
            if _append_to_last_first_cell(current_table, ogrn_value):
                # NOTE(review): the other cells of the OGRN row are dropped
                # together with it - confirm they never carry data.
                current_table.extend(cleaned_table[1:])
            else:
                # Nowhere to attach the text: keep the row instead of
                # silently discarding it.
                current_table.extend(cleaned_table)
        elif is_continuation_table(current_table, cleaned_table):
            print(f"Таблица {number} является продолжением - объединяем")
            if is_pagination_continuation(current_table, cleaned_table):
                current_table = merge_pagination_continuation(current_table, cleaned_table)
            else:
                start_index = _count_leading_duplicates(current_table, cleaned_table)
                current_table.extend(cleaned_table[start_index:])
        else:
            # Unrelated table: close the current one and start afresh.
            merged_tables.append(current_table)
            current_table = cleaned_table

    if current_table is not None:
        merged_tables.append(current_table)

    return merged_tables

def is_special_value(value):
    """Tell whether a cell value is a short marker such as '-(-)' or '-(1)'.

    Such values are flagged so the renderer can keep them on one line instead
    of letting them wrap. A falsy value is never special.
    """
    if not value:
        return False

    marker_patterns = (
        r'^\-\(.*\)$',      # -(something)
        r'^\-\.$',          # -.
        r'^\-$',            # -
        r'^\.$',            # .
        r'^\(.*\)$',        # (something)
        r'^[\-\+]\d+$',     # -123, +456
        r'^\d+[\-\+]$',     # 123-, 456+
    )
    return any(
        re.match(pattern, value) is not None
        for pattern in marker_patterns
    )

def is_header_row(row):
    """Tell whether a row is a table header.

    A row is a header when it is a merged row (text in the first cell only,
    see should_merge_row) or when its first cell contains one of the heading
    keywords (table/section/chapter/part, in Russian or English) followed by
    whitespace and a number, matched case-insensitively.
    """
    if not row:
        return False

    if should_merge_row(row):
        return True

    leading_text = "" if row[0] is None else str(row[0]).strip()

    heading_patterns = (
        r'таблица\s+\d+', r'table\s+\d+', r'раздел\s+\d+', r'section\s+\d+',
        r'глава\s+\d+', r'chapter\s+\d+', r'часть\s+\d+', r'part\s+\d+',
    )
    return any(
        re.search(pattern, leading_text, re.IGNORECASE)
        for pattern in heading_patterns
    )

def split_tables_by_headers(merged_tables):
    """Re-split the rows of all tables so that each header starts a new table.

    Rows are walked in order across every input table. A header row closes
    the table being built (if it has any rows) and becomes the first row of
    the next one; every other row is appended to the table being built.
    """
    if not merged_tables:
        return []

    result = []
    pending = []

    for table in merged_tables:
        for row in table:
            if is_header_row(row) and pending:
                result.append(pending)
                pending = []
            # Header or not, the row always lands in the table being built.
            pending.append(row)

    if pending:
        result.append(pending)

    return result

def process_table_for_merging(table):
    """Turn raw rows into per-cell descriptor dicts used for HTML rendering.

    Blank rows are removed first. A merged row (see should_merge_row) becomes
    one 'horizontal' cell spanning the column count of the first row plus
    'hidden' placeholder cells; any other row becomes one 'normal' or 'empty'
    cell per value.

    Returns:
        A list of rows, each a list of dicts with the keys 'content',
        'colspan', 'rowspan', 'type', 'is_last_row', 'is_special' and
        'is_header' (cells of non-merged rows also carry 'is_last_col').
        An empty list is returned when there is nothing to process.
    """
    if not table:
        return []

    rows = remove_empty_rows(table)
    if not rows:
        return []

    # The column count is taken from the first remaining row.
    width = len(rows[0])
    final_row = len(rows) - 1
    result = []

    def plain_cell(col, cell, closes_table):
        text = "" if cell is None else str(cell).strip()
        return {
            'content': cell,
            'colspan': 1,
            'rowspan': 1,
            'type': 'normal' if text else 'empty',
            'is_last_row': closes_table,
            'is_last_col': (col == width - 1),
            'is_special': is_special_value(text),
            # NOTE(review): always False here, so a row recognised by
            # is_header_row through its keyword patterns is not styled as a
            # header - confirm this is intended.
            'is_header': False
        }

    for position, row in enumerate(rows):
        closes_table = (position == final_row)
        header_flag = is_header_row(row)

        if not should_merge_row(row):
            result.append([
                plain_cell(col, cell, closes_table)
                for col, cell in enumerate(row)
            ])
            continue

        # Merged row: one visible cell spanning the full width ...
        lead = row[0]
        lead_text = "" if lead is None else str(lead).strip()
        spanning_cell = {
            'content': lead,
            'colspan': width,
            'rowspan': 1,
            'type': 'horizontal',
            'is_last_row': closes_table,
            'is_special': is_special_value(lead_text),
            'is_header': header_flag
        }
        # ... followed by placeholders that the renderer skips.
        placeholders = [
            {
                'content': '',
                'colspan': 0,
                'rowspan': 0,
                'type': 'hidden',
                'is_last_row': closes_table and (col == width - 1),
                'is_special': False,
                'is_header': False
            }
            for col in range(1, width)
        ]
        result.append([spanning_cell] + placeholders)

    return result

def table_to_html(processed_table):
    """Render a processed table (rows of cell descriptor dicts) as HTML.

    Args:
        processed_table: List of rows; each row is a list of dicts with the
            required keys 'colspan' and 'rowspan' and, for visible cells,
            'content' and 'type', plus the optional flags 'is_last_row',
            'is_special' and 'is_header' (default False).

    Returns:
        The '<table>' markup as a string, or "" for an empty table. Cells
        whose colspan and rowspan are both 0 are placeholders and are not
        rendered. Cell text is HTML-escaped.
    """
    if not processed_table:
        return ""

    # Collect the output lines and join once instead of growing a string.
    lines = ['<table class="pdf-table" cellpadding="5" cellspacing="0">']

    for row in processed_table:
        lines.append("  <tr>")

        for cell_info in row:
            colspan = cell_info['colspan']
            rowspan = cell_info['rowspan']
            if colspan == 0 and rowspan == 0:
                # Placeholder swallowed by a spanning cell.
                continue

            content = cell_info['content']
            cell_type = cell_info['type']
            header_like = (
                cell_type == 'horizontal'
                or cell_info.get('is_header', False)
            )

            # CSS classes
            classes = ["pdf-table-cell"]
            if header_like:
                classes.extend(["pdf-table-header", "pdf-table-header-horizontal"])
            elif cell_type == 'empty':
                classes.append("pdf-table-cell-empty")
            if cell_info.get('is_special', False):
                classes.append("pdf-table-cell-special")

            # Inline borders: filled cells get a top rule, blank ones none,
            # so blank cells visually continue the cell above them.
            if cell_type == 'normal' or header_like:
                border_styles = ["border-top: 1px solid #ddd", "border-bottom: none"]
            else:
                border_styles = ["border: none"]
            border_styles.append("border-left: 1px solid #ddd")
            border_styles.append("border-right: 1px solid #ddd")
            if cell_info.get('is_last_row', False):
                # Declared last so it overrides "border-bottom: none".
                border_styles.append("border-bottom: 1px solid #ddd")

            cell_text = html.escape(str(content)) if content is not None else ""
            colspan_attr = f' colspan="{colspan}"' if colspan > 1 else ''
            rowspan_attr = f' rowspan="{rowspan}"' if rowspan > 1 else ''
            class_list = " ".join(classes)
            style_list = "; ".join(border_styles)

            lines.append(
                f'    <td{colspan_attr}{rowspan_attr} class="{class_list}"'
                f' style="{style_list}">{cell_text}</td>'
            )

        lines.append("  </tr>")

    lines.append("</table>")
    return "\n".join(lines)

def extract_tables_from_pdf(pdf_path, start_page, end_page):
    """Extract the tables of a page range from a PDF and render them as HTML.

    Pages are addressed by zero-based index and end_page is inclusive; an
    end_page beyond the document is clamped to the last page. Tables are
    collected page by page (blank rows removed), merged across pages, split
    again at header rows and converted to HTML. Progress is printed along
    the way.

    Returns:
        The HTML of all tables joined by newlines, or "" when no table with
        content was found.
    """
    collected = []

    with pdfplumber.open(pdf_path) as pdf:
        end_page = min(end_page, len(pdf.pages) - 1)

        print(f"Обрабатываются страницы с {start_page + 1} по {end_page + 1}")

        for page_num in range(start_page, end_page + 1):
            page_tables = pdf.pages[page_num].extract_tables()

            if not page_tables:
                print(f"На странице {page_num + 1} таблиц не найдено")
                continue

            print(f"На странице {page_num + 1} найдено {len(page_tables)} таблиц")
            # Keep only tables that still have rows after dropping blank ones.
            collected.extend(
                rows
                for rows in map(remove_empty_rows, page_tables)
                if rows
            )

    if not collected:
        print("Не найдено ни одной таблицы для обработки")
        return ""

    # Merge page-spanning tables first, then cut the result at header rows.
    separated = split_tables_by_headers(merge_ogrn_continuation_tables(collected))

    print(f"После объединения и разделения по заголовкам получено {len(separated)} таблиц")

    rendered = []
    for table in separated:
        processed = process_table_for_merging(table)
        if processed:
            rendered.append(table_to_html(processed))

    return '\n'.join(rendered)

def save_tables_to_html(tables_html, output_html_path, full_html=True, title="Таблицы из PDF"):
    """Write the tables' HTML to a file, optionally wrapped in a full page.

    Args:
        tables_html: Ready-made HTML markup of the tables; written verbatim.
        output_html_path: Destination file path (written as UTF-8).
        full_html: When True, wrap the markup in a complete HTML document
            with embedded CSS; when False, write tables_html as is.
        title: Plain-text page title used for <title> and <h1>. It is
            HTML-escaped so that characters such as '&' or '<' cannot break
            the document.
    """
    if full_html:
        safe_title = html.escape(str(title))
        full_html_content = f"""
        <!DOCTYPE html>
        <html lang="ru">
        <head>
            <meta charset="UTF-8">
            <title>{safe_title}</title>
            <style>
                .pdf-table {{
                    border-collapse: collapse;
                    width: 100%;
                    margin-bottom: 30px;
                    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
                    background-color: white;
                    border: 1px solid #ddd;
                    table-layout: auto;
                }}
                .pdf-table-cell {{
                    padding: 10px;
                    text-align: left;
                    background-color: white;
                    word-wrap: break-word;
                }}
                .pdf-table-header {{
                    font-weight: bold;
                    background-color: #f8f9fa;
                }}
                .pdf-table-header-horizontal {{
                    text-align: center;
                    font-size: 1.1em;
                    background-color: #e8f4f8;
                }}
                .pdf-table-cell-empty {{
                    background-color: white;
                }}
                /* Стиль для специальных значений - запрет переноса */
                .pdf-table-cell-special {{
                    white-space: nowrap;
                    min-width: 30px;
                }}
                .pdf-table tr:hover .pdf-table-cell {{
                    background-color: #f5f5f5;
                }}
                .pdf-table tr:hover .pdf-table-header {{
                    background-color: #e0e0e0;
                }}
            </style>
        </head>
        <body>
            <h1>{safe_title}</h1>
            {tables_html}
        </body>
        </html>
        """
    else:
        full_html_content = tables_html

    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write(full_html_content)
    print(f"HTML файл сохранен как: {output_html_path}")

def extract_tables_to_html(pdf_path, start_page, end_page, output_html_path, full_html=True):
    """Extract tables from a PDF page range and save them as an HTML file.

    Nothing is written when the extraction yields no HTML; a message is
    printed instead.
    """
    tables_html = extract_tables_from_pdf(pdf_path, start_page, end_page)
    if not tables_html:
        print("Нечего сохранять - таблицы не найдены")
        return
    save_tables_to_html(tables_html, output_html_path, full_html)

# Example usage: convert the tables found on pages 6..501 of "1.pdf" and write
# them to "1.html" as a full HTML page. The page arguments are zero-based
# indices, and an end index past the last page is clamped by
# extract_tables_from_pdf.
if __name__ == "__main__":
    extract_tables_to_html("1.pdf", 5, 500, "1.html")