from typing import List, Optional, Dict
from datetime import datetime, timedelta, timezone
import re
from core.models import Event, Outcome, ArbOpportunity, MiddleOpportunity
from fuzzywuzzy import fuzz
import config
import logging

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Event category detection (prevents cross-gender / cross-age-group matches)
# ---------------------------------------------------------------------------

# Lower-case markers of a women's competition or team; matched against the
# lower-cased "league home away" text of an event.
_WOMENS_WORDS = frozenset({
    'women', "women's", 'woman', 'ladies', 'lady',
    'female', 'girls', 'dames', 'damen',
})
# Youth age-group tag U15–U23. Lower-case only: callers lower-case the text first.
_YOUTH_RE = re.compile(r'\bu(1[5-9]|2[0-3])\b')
# Trailing B-team / reserves suffix on a team name ("... B", "... II", "... 2").
_RESERVE_RE = re.compile(r'\s+(b|ii|2|reserves?)\s*$', re.IGNORECASE)
# "Doubles" / "DBL" / "DBS" / "Mixed" keyword in a league or team name.
_DOUBLES_RE = re.compile(r'\b(doubles?|dbl|dbs|mixed)\b', re.IGNORECASE)


def _event_category(event: Event):
    """Classify an event by gender, youth age group and reserve-team status.

    Returns a tuple ``(is_womens, youth_group, is_reserve)``:

    * ``is_womens``   – True when any entry of ``_WOMENS_WORDS`` occurs in the
      lower-cased "league home away" text.
    * ``youth_group`` – the first ``_YOUTH_RE`` match in that text (e.g.
      ``'u19'``), or None when there is none.
    * ``is_reserve``  – True when either raw team name ends in a reserve
      suffix recognised by ``_RESERVE_RE``.
    """
    haystack = f"{event.league} {event.home_team} {event.away_team}".lower()

    # NOTE(review): this is a plain substring test, so a marker embedded in a
    # longer word (e.g. 'lady' inside 'gladys') also counts as a hit.
    womens = any(word in haystack for word in _WOMENS_WORDS)

    youth_match = _YOUTH_RE.search(haystack)
    youth_group = youth_match.group(0) if youth_match else None

    # The reserve suffix is checked per team name, not on the combined text,
    # because the pattern is anchored to the end of the string.
    reserve = any(
        _RESERVE_RE.search(team) for team in (event.home_team, event.away_team)
    )
    return (womens, youth_group, reserve)


def _is_doubles(event: Event) -> bool:
    """Return True if the event looks like a doubles match.

    Detection layers (first hit wins):
    1. League name contains 'doubles', 'dbl', 'dbs' or 'mixed'.
    2. Either team name contains '/' (standard doubles separator).
    3. Either team name contains one of the keywords from layer 1.
    4. Tennis only: either name has 4+ word tokens. Per the original note,
       providers such as Sportradar/Bangbet sometimes omit the '/' between
       partners, so 'John Smith/Mike Jones' arrives as 'John Smith Mike Jones'.

    NOTE(review): layer 4 is a heuristic; a singles player whose name has four
    or more parts is also classified as doubles.
    """
    if _DOUBLES_RE.search(event.league):
        return True
    is_tennis = event.sport == 'tennis'
    for name in (event.home_team, event.away_team):
        if '/' in name or _DOUBLES_RE.search(name):
            return True
        if is_tennis:
            # Count runs of letters/digits. `[^\W_]` is Unicode-aware, so an
            # accented name such as 'Carreño' stays ONE token; the previous
            # ASCII-only split on [^a-z0-9] broke it into 'carre' + 'o' and
            # could push a three-part singles name over the threshold.
            # Case is irrelevant for counting, so the name is not lower-cased.
            tokens = re.findall(r'[^\W_]+', name)
            if len(tokens) >= 4:
                return True
    return False


def _same_category(a: Event, b: Event) -> bool:
    """Return True only when both events belong to the same category.

    The events must agree on the (women's, youth group, reserve) tuple from
    ``_event_category``. For tennis (decided by ``a.sport``) they must also
    agree on singles vs doubles, so a singles match is never paired with a
    doubles match.
    """
    if _event_category(a) != _event_category(b):
        return False
    if a.sport != 'tennis':
        return True
    # Tennis: singles and doubles must not be cross-matched.
    return _is_doubles(a) == _is_doubles(b)


# ---------------------------------------------------------------------------
# Arbitrage maths
# ---------------------------------------------------------------------------

def arb_margin(odds: List[float]) -> Optional[float]:
    """Return the arbitrage profit in percent, or None when there is no arb.

    ``odds`` are decimal odds, one per outcome of the market. An arb exists
    when the sum of inverse odds is below 1.0; the profit is
    ``(1 / inverse_sum - 1) * 100``.

    Returns None when:
    * ``odds`` is empty or contains a non-positive value. Such odds are not
      valid prices: zero would raise ZeroDivisionError and a negative value
      would lower the inverse sum and fabricate an arb.
    * the inverse sum is below 0.85. A legitimate market sits above 1.0
      (bookmaker margin); dropping to 0.85 signals that the outcomes are not
      truly complementary (same side, wrong event, or misread handicap line)
      rather than genuine mispricing.
    * the inverse sum is 1.0 or higher (no arbitrage).
    """
    odds = list(odds)
    if not odds or any(o <= 0 for o in odds):
        return None  # invalid price data, never an opportunity
    inverse_sum = sum(1.0 / o for o in odds)
    if 0.85 <= inverse_sum < 1.0:
        return ((1.0 / inverse_sum) - 1.0) * 100
    return None


def optimal_stakes(total_stake: float, outcomes: List[Outcome]) -> List[dict]:
    """Split ``total_stake`` so every outcome pays back the same amount.

    With ``S = sum(1 / odds_j)`` the common payout is ``total_stake / S`` and
    each leg is staked ``payout / odds_i``.

    Returns one dict per outcome, in input order, with the keys 'bookmaker',
    'outcome', 'odds', 'stake', 'potential_return', 'event_url' and
    'event_id'. 'stake' and 'potential_return' are rounded to 2 decimals.
    """
    implied_total = sum(1.0 / leg.odds for leg in outcomes)
    payout = total_stake / implied_total

    def _leg_plan(leg) -> dict:
        amount = payout / leg.odds
        return {
            'bookmaker':  leg.bookmaker,
            'outcome':    leg.name,
            'odds':       leg.odds,
            'stake':      round(amount, 2),
            'potential_return': round(amount * leg.odds, 2),
            'event_url':  leg.event_url,
            'event_id':   leg.event_id,
        }

    return [_leg_plan(leg) for leg in outcomes]


# ---------------------------------------------------------------------------
# Event matching across bookmakers
# ---------------------------------------------------------------------------

def _times_close(t1: Optional[datetime], t2: Optional[datetime],
                 require_known: bool = False) -> bool:
    """Return True if t1 and t2 are within TIME_TOLERANCE_MINUTES of each other.

    If require_known=True (used for fuzzy matches), both times must be non-None;
    a missing time on either side is treated as a mismatch so we don't silently
    merge unrelated events just because one bookmaker omitted the kick-off.
    With require_known=False a missing time is treated as compatible.

    Aware datetimes are converted to UTC before comparison. Naive datetimes are
    taken as already being in UTC, which is the convention this module uses
    when it builds its scan cutoff.
    """
    if t1 is None or t2 is None:
        return not require_known   # strict: reject; lenient: allow

    def _naive_utc(dt: datetime) -> datetime:
        # Convert BEFORE dropping tzinfo: a bare replace(tzinfo=None) keeps the
        # local wall-clock reading, so 20:00+02:00 would be compared as 20:00
        # instead of 18:00 UTC.
        if dt.utcoffset() is not None:
            dt = dt.astimezone(timezone.utc)
        return dt.replace(tzinfo=None)

    diff_minutes = abs((_naive_utc(t1) - _naive_utc(t2)).total_seconds()) / 60
    return diff_minutes <= config.TIME_TOLERANCE_MINUTES


def _leagues_compatible(a: Event, b: Event) -> bool:
    """Return False only when both league names are known and clearly differ.

    Two non-empty league names are compatible when their
    ``fuzz.token_set_ratio`` is at least 65. If either name is empty after
    stripping, the leagues cannot be told apart and the result is True.

    token_set_ratio scores subset containment, so country-prefixed variants
    pass ('La Liga' vs 'Spain - La Liga', 'Premier League' vs 'English Premier
    League'), while genuinely different divisions ('Division de Honor' vs
    'División Intermedia', ~57) are blocked.
    """
    league_names = (a.league.strip(), b.league.strip())
    if not all(league_names):
        # One or both leagues unknown: a match cannot be ruled out.
        return True
    return fuzz.token_set_ratio(*league_names) >= 65


def _teams_match(a: Event, b: Event) -> bool:
    """Fuzzy-compare the two team names of ``a`` with those of ``b``.

    Teams are compared one-to-one rather than as a combined string, so a
    shared opponent cannot inflate the score ('Latvia U19 vs Belgium U19' must
    not match 'Slovenia U19 vs Belgium U19'). Both pairings are scored; a
    pairing's score is its weaker team match, and the events match when the
    better pairing reaches 80. token_set_ratio tolerates prefixes such as
    'AC Milan'/'Milan'.

    Team names are read from ``match_key``, expected as 'prefix:team1:team2'.
    """
    _, first_a, second_a = a.match_key.split(':', 2)
    _, first_b, second_b = b.match_key.split(':', 2)
    ratio = fuzz.token_set_ratio

    # match_key holds the teams pre-sorted, but differing spellings can sort
    # differently, so the crossed pairing is scored as well.
    straight = min(ratio(first_a, first_b), ratio(second_a, second_b))
    crossed = min(ratio(first_a, second_b), ratio(second_a, first_b))
    return straight >= 80 or crossed >= 80


def _is_teams_reversed(ref: Event, other: Event) -> bool:
    """Return True if ``other`` lists the teams in the opposite order to ``ref``.

    Works on the raw home/away names (not the sorted match_key) so that
    'A vs B' at one bookmaker and 'B vs A' at another can be told apart.
    """
    # Fast path: exact comparison of names normalised by Event._norm. Per the
    # original note, "Tochigi SC" and "Tochigi Uva FC" normalise to different
    # strings here even though token_set_ratio scores every pairing 100 when a
    # city name dominates.
    ref_pair = (Event._norm(ref.home_team), Event._norm(ref.away_team))
    other_pair = (Event._norm(other.home_team), Event._norm(other.away_team))

    if ref_pair == other_pair:
        return False   # confirmed same order
    if ref_pair == other_pair[::-1]:
        return True    # confirmed reversed

    # Fuzzy fallback for spelling variants (e.g. "AFC Ajax" / "Ajax Amsterdam")
    # whose normalised forms are not identical. Only the orientation is being
    # decided, so plain token_set_ratio on the lower-cased raw names is used,
    # which also copes with abbreviated names such as "KTE" vs
    # "KTE Kecskemeti".
    ref_home, ref_away = (s.lower().strip() for s in (ref.home_team, ref.away_team))
    oth_home, oth_away = (s.lower().strip() for s in (other.home_team, other.away_team))

    ratio = fuzz.token_set_ratio
    same_order = min(ratio(ref_home, oth_home), ratio(ref_away, oth_away))
    swapped_order = min(ratio(ref_home, oth_away), ratio(ref_away, oth_home))

    # Reversed only when the swapped pairing is strictly the better fit.
    return swapped_order > same_order


def group_events_by_match(all_events: List[Event]) -> Dict[str, List[Event]]:
    """
    Group events from different bookmakers that represent the same match.
    Uses the normalized match_key first; falls back to fuzzy matching.

    Exact-key matches: allow one missing start time (lenient), but still block
    obviously incompatible leagues.
    Fuzzy matches: require both start times to be present and close, AND
    require league names to be compatible.  This prevents same-named clubs from
    different competitions being merged into a phantom arb opportunity.

    Returns a dict of group key -> events, in creation order. The first group
    for a match_key is stored under that key. If a later event has the same
    match_key but is incompatible with it (e.g. the same two teams meeting
    again at a different time), it starts a new group under '<match_key>#2',
    '#3', ... Previously such an event overwrote the existing group and
    silently discarded everything already collected for the earlier fixture.
    """
    groups: Dict[str, List[Event]] = {}
    # match_key -> keys of every group opened for that match_key, so all
    # same-key fixtures get the lenient exact-key comparison.
    keys_for_match: Dict[str, List[str]] = {}

    for event in all_events:
        key = event.match_key
        target: Optional[List[Event]] = None

        # Exact key: lenient on a missing start time.
        for group_key in keys_for_match.get(key, ()):
            candidate = groups[group_key]
            ref = candidate[0]
            if (_times_close(ref.starts_at, event.starts_at)
                    and _same_category(ref, event)
                    and _leagues_compatible(ref, event)):
                target = candidate
                break

        if target is None:
            # Fuzzy fallback — stricter: both times must be known and close,
            # and leagues must be compatible.
            for candidate in groups.values():
                ref = candidate[0]
                if (ref.sport == event.sport
                        and _times_close(ref.starts_at, event.starts_at,
                                         require_known=True)
                        and _same_category(ref, event)
                        and _leagues_compatible(ref, event)
                        and _teams_match(ref, event)):
                    target = candidate
                    break

        if target is not None:
            target.append(event)
            continue

        # Open a new group without clobbering an existing one.
        siblings = keys_for_match.setdefault(key, [])
        group_key = key
        suffix = len(siblings) + 1
        while group_key in groups:
            group_key = f'{key}#{suffix}'
            suffix += 1
        groups[group_key] = [event]
        siblings.append(group_key)

    return groups


# ---------------------------------------------------------------------------
# Odds-inversion correction
# ---------------------------------------------------------------------------

def _fix_inverted_home_away(outcomes: List[Outcome]) -> List[Outcome]:
    """Relabel bookmakers whose Home/Away prices are swapped versus consensus.

    Targets a feed that has the right team names but maps Win1→Away and
    Win2→Home, which would otherwise look like an arb. At least three
    bookmakers quoting both 'Home' and 'Away' are required; with fewer the
    input list is returned unchanged.

    Each bookmaker is compared with the mean Home and Away prices of the
    other bookmakers. It is flagged only when its prices, read swapped, are
    within 10% of that consensus on both sides AND the swapped reading fits
    better than the as-is reading by a factor of more than 2.5
    (``fit_swap < 0.4 * fit_same``). The tight bound avoids flagging a
    bookmaker that simply offers better, non-swapped odds.

    Returns the same list object when nothing is flagged; otherwise a new
    list in which the flagged bookmakers' 'Home'/'Away' outcomes are rebuilt
    with the opposite label.
    """
    # Highest price seen per bookmaker for each side.
    bm_home: Dict[str, float] = {}
    bm_away: Dict[str, float] = {}
    side_tables = {'Home': bm_home, 'Away': bm_away}
    for quote in outcomes:
        table = side_tables.get(quote.name)
        if table is not None:
            table[quote.bookmaker] = max(table.get(quote.bookmaker, 0), quote.odds)

    both = {bm for bm in bm_home if bm in bm_away}
    if len(both) < 3:
        return outcomes   # need ≥3 for a reliable consensus

    def _rel_gap(price: float, target: float) -> float:
        # Relative deviation of `price` from `target`.
        return abs(price - target) / target

    inverted: set = set()
    for suspect in both:
        others = both - {suspect}
        cons_home = sum(bm_home[b] for b in others) / len(others)
        cons_away = sum(bm_away[b] for b in others) / len(others)
        own_home, own_away = bm_home[suspect], bm_away[suspect]

        # Worst-side deviation when the suspect's labels are read swapped...
        fit_swap = max(_rel_gap(own_home, cons_away), _rel_gap(own_away, cons_home))
        # ...and when they are read as given.
        fit_same = max(_rel_gap(own_home, cons_home), _rel_gap(own_away, cons_away))

        if fit_swap < 0.10 and fit_swap < fit_same * 0.4:
            inverted.add(suspect)

    if not inverted:
        return outcomes

    logger.info(
        f'[Calculator] inverted Home/Away detected for {inverted} — swapping labels'
    )

    _SWAP = {'Home': 'Away', 'Away': 'Home'}
    return [
        Outcome(
            name=_SWAP[o.name], odds=o.odds,
            bookmaker=o.bookmaker, event_url=o.event_url, event_id=o.event_id,
        )
        if o.bookmaker in inverted and o.name in _SWAP
        else o
        for o in outcomes
    ]


# ---------------------------------------------------------------------------
# Correct Score score reversal
# ---------------------------------------------------------------------------

# Correct-score label "H-A" with one- or two-digit goal counts on each side.
_CS_SCORE_RE = re.compile(r'^(\d{1,2})-(\d{1,2})$')
# Side-specific catch-all labels mapped to their mirror image.
_CS_SWAP = {
    'Any Other Home Win': 'Any Other Away Win',
    'Any Other Away Win': 'Any Other Home Win',
}


def _reverse_cs_score(name: str) -> str:
    """Mirror a correct-score label for a fixture listed in the opposite order.

    * "1-0" → "0-1": the two goal counts trade places.
    * "Any Other Home Win" ↔ "Any Other Away Win".
    * Any other label (e.g. "Any Other Draw", "Any Other Score") is returned
      unchanged.
    """
    mirrored = _CS_SWAP.get(name)
    if mirrored is not None:
        return mirrored
    parsed = _CS_SCORE_RE.match(name)
    if parsed is None:
        return name
    home_goals, away_goals = parsed.groups()
    return f'{away_goals}-{home_goals}'


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------

# Lower-case team names that stand in for a real participant (generic
# home/away labels, "to be decided" markers, bracket slots).
_PLACEHOLDER_NAMES = frozenset({
    'home', 'away',
    'home team', 'away team',
    'home teams', 'away teams',
    'team 1', 'team 2',
    'tbd', 'tba',
    'winner', 'loser',
})


def _is_placeholder_team(name: str) -> bool:
    """Return True if ``name`` is a placeholder rather than a real team name.

    The comparison ignores surrounding whitespace and letter case.
    """
    cleaned = name.strip().lower()
    return cleaned in _PLACEHOLDER_NAMES


def find_arb_opportunities(events_by_bookmaker: Dict[str, List[Event]]) -> List[ArbOpportunity]:
    """Find all arbitrage opportunities across bookmakers.

    Pipeline:
    1. Keep events that start within ``config.HOURS_AHEAD`` hours (or have no
       start time) and whose team names are not placeholders.
    2. Group events that represent the same match (``group_events_by_match``).
    3. Per group and market, collect every outcome, re-labelling outcomes of
       events that list the teams in the opposite order, then correct
       bookmakers whose Home/Away prices are swapped.
    4. Take the best price per outcome and report an opportunity when the
       margin lies within ``config.MIN_ARB_PERCENTAGE`` and
       ``config.MAX_ARB_PERCENTAGE``.

    Returns the opportunities sorted by arb percentage, highest first. Stakes
    are computed for a total stake of 100.
    """
    # Aware cutoff: comparing aware datetimes honours each value's UTC offset.
    # The previous replace(tzinfo=None) dropped the offset without converting,
    # so aware non-UTC start times were compared at the wrong instant.
    cutoff = datetime.now(timezone.utc) + timedelta(hours=config.HOURS_AHEAD)

    def _within_horizon(starts_at: Optional[datetime]) -> bool:
        if starts_at is None:
            return True   # unknown kick-off: keep the event
        if starts_at.utcoffset() is None:
            # Naive start times are taken to be UTC.
            starts_at = starts_at.replace(tzinfo=timezone.utc)
        return starts_at <= cutoff

    all_events = [
        e for events in events_by_bookmaker.values() for e in events
        if _within_horizon(e.starts_at)
        and not _is_placeholder_team(e.home_team)
        and not _is_placeholder_team(e.away_team)
    ]
    groups = group_events_by_match(all_events)

    swap = {'Home': 'Away', 'Away': 'Home'}
    opportunities: List[ArbOpportunity] = []

    for match_key, events in groups.items():
        # Need events from at least 2 different bookmakers
        if len({e.bookmaker for e in events}) < 2:
            continue

        for market in {e.market for e in events}:
            market_events = [e for e in events if e.market == market]
            if len({e.bookmaker for e in market_events}) < 2:
                continue

            # Some bookmakers list a fixture as "B vs A" while others list
            # "A vs B". They share a group, but without correction "Away" from
            # one would be compared with "Home" from the other — the same real
            # team — creating a phantom arb. Outcomes of reversed events are
            # re-labelled relative to the first event of the market.
            ref = market_events[0]
            normalized_outcomes: List[Outcome] = []
            for event in market_events:
                flipped = _is_teams_reversed(ref, event)
                for outcome in event.outcomes:
                    if not flipped:
                        name = outcome.name
                    elif outcome.name in swap:
                        name = swap[outcome.name]
                    else:
                        name = _reverse_cs_score(outcome.name)
                    normalized_outcomes.append(Outcome(
                        name=name,
                        odds=outcome.odds,
                        bookmaker=outcome.bookmaker,
                        event_url=outcome.event_url,
                        event_id=event.event_id,   # carry scraper event_id through
                    ))

            normalized_outcomes = _fix_inverted_home_away(normalized_outcomes)

            # Best odds for each outcome (across all bookmakers); the first
            # seen wins ties.
            best: Dict[str, Outcome] = {}
            for outcome in normalized_outcomes:
                current = best.get(outcome.name)
                if current is None or outcome.odds > current.odds:
                    best[outcome.name] = outcome

            best_outcomes = list(best.values())

            # Best odds must come from at least 2 different bookmakers
            if len({o.bookmaker for o in best_outcomes}) < 2:
                continue

            # Sanity check: if the same bookmaker appears more than once in
            # best_outcomes, all its outcomes must come from the same event_id.
            # A divergence means two different matches from that bookmaker were
            # incorrectly merged by fuzzy matching — skip this phantom arb.
            bm_eids: Dict[str, set] = {}
            for o in best_outcomes:
                bm_eids.setdefault(o.bookmaker, set()).add(o.event_id)
            conflicting = {bm: eids for bm, eids in bm_eids.items() if len(eids) > 1}
            if conflicting:
                logger.info(
                    f'[Calculator] Skipping phantom arb ({match_key} {market}): '
                    f'same-bookmaker event_id conflict — {conflicting}'
                )
                continue

            # Decimal odds of 1.0 or less are not a bettable price. With
            # positive odds such a market can never be an arb, so skipping
            # changes nothing for valid data; it keeps zero odds from raising
            # ZeroDivisionError and negative odds from faking an arb.
            if any(o.odds <= 1.0 for o in best_outcomes):
                continue

            pct = arb_margin([o.odds for o in best_outcomes])
            if pct is None or not (config.MIN_ARB_PERCENTAGE <= pct <= config.MAX_ARB_PERCENTAGE):
                continue

            opportunities.append(ArbOpportunity(
                sport=ref.sport,
                event_name=f"{ref.home_team} vs {ref.away_team}",
                league=ref.league,
                market=market,
                outcomes=optimal_stakes(100, best_outcomes),
                arb_percentage=pct,
                profit_per_100=pct,
                starts_at=ref.starts_at,
            ))

    opportunities.sort(key=lambda x: x.arb_percentage, reverse=True)
    logger.info(f"Arb scan complete: {len(opportunities)} opportunities found")
    return opportunities


# ---------------------------------------------------------------------------
# Middle detection
# ---------------------------------------------------------------------------

def _parse_ou_line(market: str) -> Optional[float]:
    """Extract the line from a full-time Over/Under market name.

    'Over/Under 2.5' → 2.5. Returns None for any other market name, including
    HT/2H variants (anything before 'Over/Under' or after the number), and for
    a malformed number such as '1.2.3' or '.', which the pattern admits but
    float() rejects.
    """
    m = re.match(r'^Over/Under\s+([\d.]+)$', market.strip())
    if m is None:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        # Digits-and-dots text that is not a number: not a usable line.
        return None


def find_middle_opportunities(events_by_bookmaker: Dict[str, List[Event]]) -> List[MiddleOpportunity]:
    """Find Over/Under middle opportunities across bookmakers.

    A middle exists when Book A offers Over X and Book B offers Under Y for the
    same game, with X < Y. If the result lands between X and Y, both bets win.
    miss_pct < 0 means guaranteed profit even without the middle (better than arb).

    For every pair of lines at least 1.0 apart, the best cross-book
    (Over at the lower line, Under at the higher line) pair is reported,
    provided a miss costs at most 25% of the total stake. Results are sorted
    by miss_pct, smallest first.
    """
    # Aware cutoff: comparing aware datetimes honours each value's UTC offset.
    # The previous replace(tzinfo=None) dropped the offset without converting.
    cutoff = datetime.now(timezone.utc) + timedelta(hours=config.HOURS_AHEAD)

    def _within_horizon(starts_at: Optional[datetime]) -> bool:
        if starts_at is None:
            return True   # unknown kick-off: keep the event
        if starts_at.utcoffset() is None:
            # Naive start times are taken to be UTC.
            starts_at = starts_at.replace(tzinfo=timezone.utc)
        return starts_at <= cutoff

    all_events = [
        e for events in events_by_bookmaker.values() for e in events
        if _within_horizon(e.starts_at)
        and not _is_placeholder_team(e.home_team)
        and not _is_placeholder_team(e.away_team)
    ]

    groups = group_events_by_match(all_events)
    middles: List[MiddleOpportunity] = []

    for events in groups.values():
        ref = events[0]

        # line → outcome name → bookmaker → (best_odds, bm, url, eid)
        # Prices are kept per bookmaker: keeping only the overall best made
        # the cross-book check drop valid middles whenever the best Over and
        # the best Under happened to come from the same bookmaker.
        ou_by_line: Dict[float, Dict[str, Dict[str, tuple]]] = {}

        for event in events:
            line = _parse_ou_line(event.market)
            if line is None:
                continue
            for outcome in event.outcomes:
                if outcome.name not in ('Over', 'Under'):
                    continue
                per_bm = ou_by_line.setdefault(line, {}).setdefault(outcome.name, {})
                existing = per_bm.get(outcome.bookmaker)
                if existing is None or outcome.odds > existing[0]:
                    per_bm[outcome.bookmaker] = (outcome.odds, outcome.bookmaker,
                                                 outcome.event_url, event.event_id)

        lines = sorted(ou_by_line)

        for i, line_low in enumerate(lines):
            for line_high in lines[i + 1:]:
                window = line_high - line_low
                # Only integer-goal windows matter for football (0.5-step lines)
                if window < 1.0:
                    continue

                overs = ou_by_line[line_low].get('Over')     # Over at lower line
                unders = ou_by_line[line_high].get('Under')  # Under at higher line
                if not overs or not unders:
                    continue

                # Pick the cross-book pair with the highest harmonic value.
                # harmonic = o*u/(o+u) is the return per unit of total stake
                # when exactly one leg wins (stakes balanced so either single
                # win pays the same).
                best_pair = None   # (harmonic, over_data, under_data)
                for over_data in overs.values():
                    for under_data in unders.values():
                        # Require cross-book — same-book middles are
                        # arbitraged away by the platform
                        if over_data[1] == under_data[1]:
                            continue
                        if over_data[0] <= 1.0 or under_data[0] <= 1.0:
                            continue
                        harmonic = (over_data[0] * under_data[0]
                                    / (over_data[0] + under_data[0]))
                        if best_pair is None or harmonic > best_pair[0]:
                            best_pair = (harmonic, over_data, under_data)

                if best_pair is None:
                    continue

                harmonic, over_data, under_data = best_pair
                over_odds, over_bm, over_url, over_eid = over_data
                under_odds, under_bm, under_url, under_eid = under_data

                # miss_pct < 0 → guaranteed profit on every outcome (arb-middle)
                miss_pct = (1 - harmonic) * 100
                hit_pct  = (2 * harmonic - 1) * 100

                if miss_pct > 25:  # willing to risk at most 25% on a miss
                    continue

                middles.append(MiddleOpportunity(
                    sport      = ref.sport,
                    event_name = f'{ref.home_team} vs {ref.away_team}',
                    league     = ref.league,
                    over_line  = line_low,
                    under_line = line_high,
                    over_bm    = over_bm,
                    under_bm   = under_bm,
                    over_odds  = over_odds,
                    under_odds = under_odds,
                    over_url   = over_url or '',
                    under_url  = under_url or '',
                    over_eid   = over_eid,
                    under_eid  = under_eid,
                    window     = window,
                    miss_pct   = miss_pct,
                    hit_pct    = hit_pct,
                    starts_at  = ref.starts_at,
                ))

    middles.sort(key=lambda m: m.miss_pct)  # best (smallest loss risk) first
    logger.info(f"Middle scan complete: {len(middles)} opportunities found")
    return middles