<?php

namespace App\Services;

use Illuminate\Support\Facades\Log;

/**
 * Parses the OCR text of a single voter "box" into a flat array of fields.
 *
 * The parser is line oriented: boilerplate noise is stripped first, a voter ID
 * token (three capital letters + seven digits) is chosen, then every non-empty
 * line is matched against the labelled fields (Name, Father/Husband/Mother/Other
 * Name, House Number, Age, Gender). Progressive fallbacks are used for the name.
 */
class VoterBoxParser
{
    /** Regex fragment for one voter ID token: three capital letters followed by seven digits. */
    private const VOTER_ID_TOKEN = '[A-Z]{3}\d{7}';

    /** Boilerplate fragments removed before any field is parsed. */
    private const NOISE_PATTERNS = [
        '/Photo\s+Available/i',
        '/Age as on\s+\d{2}-\d{2}-\d{4}/i',
        '/#\s*\/\s*#\d+/i',
        '/Modified as per supplement.*/i',
        '/Total Pages \d+ - Page \d+/i',
    ];

    /**
     * Extract the voter fields from the raw text of one box.
     *
     * @param string $text Raw (OCR) text of a single voter box; may be empty.
     *
     * @return array{
     *     voter_id_number: ?string,
     *     serial_number: ?int,
     *     raw_serial_line: ?string,
     *     name: ?string,
     *     relation_type: ?string,
     *     relation_name: ?string,
     *     house_number: ?string,
     *     age: ?int,
     *     gender: ?string,
     *     is_deleted: bool
     * }
     */
    public function parseVoterBox(string $text): array
    {
        $text = $this->stripNoise($text);

        // Non-empty, trimmed lines used for all field matching.
        $lines = array_filter(array_map('trim', explode("\n", trim($text))));

        $data = [
            'voter_id_number' => $this->pickVoterId($text),
            'serial_number' => null,
            // Upper-cased short token most recently examined as a serial candidate
            // (it keeps being overwritten until a valid serial number is accepted).
            'raw_serial_line' => null,
            'name' => null,
            'relation_type' => null,
            'relation_name' => null,
            'house_number' => null,
            'age' => null,
            'gender' => null,
            'is_deleted' => false,
        ];

        foreach ($lines as $line) {
            // Serial candidate: a line made of a single alphanumeric token of 1-6 characters.
            if ($data['serial_number'] === null && preg_match('/^\s*([0-9A-Za-z]{1,6})\s*$/', $line, $sm)) {
                $rawToken = strtoupper($sm[1]);
                $data['raw_serial_line'] = $rawToken;

                // Only a pure 1-4 digit token in the range 1..4000 is accepted; anything else
                // (including tokens containing letters such as 'S') is recorded raw and ignored.
                if (preg_match('/^\d{1,4}$/', $rawToken)) {
                    $candidate = (int) $rawToken;
                    if ($candidate > 0 && $candidate <= 4000) {
                        $data['serial_number'] = $candidate;
                    }
                }

                // A lone short token cannot carry any labelled field.
                continue;
            }

            // Voter name: the label must start the line so relation-name lines are not picked up.
            if (empty($data['name']) && preg_match('/^Name\s*:\s*(.+)$/i', $line, $matches)) {
                $data['name'] = $this->cleanName($matches[1]);
            }

            // Relation details (first usable match wins).
            if (empty($data['relation_name']) && preg_match('/(Father|Husband|Mother|Other)\s*Name\s*:\s*(.+)/i', $line, $matches)) {
                $data['relation_type'] = strtolower($matches[1]);
                $data['relation_name'] = $this->cleanName($matches[2]);
            }

            // House number, age and gender: the last matching line wins.
            if (preg_match('/House\s*Number\s*:\s*(.+)/i', $line, $matches)) {
                $data['house_number'] = trim($matches[1]);
            }

            if (preg_match('/Age\s*:\s*(\d+)/i', $line, $matches)) {
                $data['age'] = (int) $matches[1];
            }

            if (preg_match('/Gender\s*:\s*(Male|Female)/i', $line, $matches)) {
                $data['gender'] = strtolower($matches[1]);
            }
        }

        // Fallback 1: label without a colon, e.g. "Name KIRAN".
        if (empty($data['name'])) {
            foreach ($lines as $line) {
                if (preg_match('/^Name\s+(.+)/i', $line, $matches)) {
                    $data['name'] = $this->cleanName($matches[1]);
                    break;
                }
            }
        }

        // Fallback 2: first unlabelled line that still looks like a name.
        if (empty($data['name'])) {
            $guessed = $this->guessUnlabelledName($lines);
            if ($guessed !== null) {
                $data['name'] = $guessed;
            }
        }

        // Deleted marker anywhere in the box.
        if (preg_match('/DELETED/i', $text)) {
            $data['is_deleted'] = true;
        }

        // Gender keyword without its label.
        if (empty($data['gender']) && preg_match('/\b(Male|Female)\b/i', $text, $m)) {
            $data['gender'] = strtolower($m[1]);
        }

        return $data;
    }

    /**
     * Remove known boilerplate fragments from the box text.
     *
     * If PCRE reports an error (preg_replace returns null) the text is returned unchanged
     * rather than propagating null into the rest of the parser.
     */
    private function stripNoise(string $text): string
    {
        return preg_replace(self::NOISE_PATTERNS, '', $text) ?? $text;
    }

    /**
     * Choose the most plausible voter ID in the box.
     *
     * The earliest line consisting solely of an ID token wins; when no such line exists the
     * first ID token found anywhere is used. Because lines are scanned top to bottom, the
     * earliest standalone ID is selected regardless of how far down it sits, so no separate
     * "top lines" pass is needed. A standalone ID line cannot contain a field label, so IDs
     * whose letters happen to spell one (e.g. "AGE1234567") are accepted.
     *
     * @return string|null The chosen ID, or null when the text contains no ID token.
     */
    private function pickVoterId(string $text): ?string
    {
        $firstSeen = null;
        $rawLines = preg_split('/\r?\n/', trim($text)) ?: [];

        foreach ($rawLines as $rawLine) {
            $trimmed = trim($rawLine);

            if (preg_match('/^\s*(' . self::VOTER_ID_TOKEN . ')\s*$/', $trimmed, $m) === 1) {
                return $m[1];
            }

            if ($firstSeen === null && preg_match('/\b(' . self::VOTER_ID_TOKEN . ')\b/', $rawLine, $m) === 1) {
                $firstSeen = $m[1];
            }
        }

        return $firstSeen;
    }

    /**
     * Last-resort name detection: the first line without a field label that, once cleaned,
     * is longer than two characters and not numeric.
     *
     * Voter ID tokens and the DELETED marker are removed from the line before cleaning;
     * otherwise a bare ID line such as "ABC1234567" would be reduced to "ABC" and a
     * standalone "DELETED" line would be returned as the voter's name.
     *
     * @param array<int, string> $lines Trimmed, non-empty lines of the box.
     */
    private function guessUnlabelledName(array $lines): ?string
    {
        $nonNameTokens = ['/\b' . self::VOTER_ID_TOKEN . '\b/', '/DELETED/i'];

        foreach ($lines as $line) {
            if (preg_match('/(Name|Father|Husband|Mother|Other|House|Age|Gender)/i', $line)) {
                continue;
            }

            $stripped = preg_replace($nonNameTokens, '', $line) ?? $line;
            $cleaned = $this->cleanName($stripped);

            if (strlen($cleaned) > 2 && !is_numeric($cleaned)) {
                return $cleaned;
            }
        }

        return null;
    }

    /**
     * Drop a leading "Name:" label and every character that is not an ASCII letter or
     * whitespace, then trim the result.
     */
    private function cleanName(string $name): string
    {
        $withoutLabel = preg_replace('/^Name\s*:/i', '', $name) ?? $name;
        $lettersOnly = preg_replace('/[^a-zA-Z\s]/', '', $withoutLabel) ?? $withoutLabel;

        return trim($lettersOnly);
    }

    /**
     * Map a relation label to its one-letter code (F, H, M; anything else becomes O).
     * Not called from within this class.
     */
    private function normalizeRelationType(string $type): string
    {
        return match (strtolower($type)) {
            'father' => 'F',
            'husband' => 'H',
            'mother' => 'M',
            default => 'O',
        };
    }

    /**
     * Normalise a gender label to 'male', 'female' or 'other'.
     * Not called from within this class.
     */
    private function normalizeGender(string $gender): string
    {
        $normalized = strtolower(trim($gender));

        return in_array($normalized, ['male', 'female'], true) ? $normalized : 'other';
    }
}
