import pandas as pd
import re
from pathlib import Path
from matplotlib.colors import LinearSegmentedColormap, Normalize

PROJ = Path("/nfshome/mgaran/mclab/SHARE/McIntyre_Lab/sex_specific_splicing")
TABLES_DIR   = PROJ / "Tables"
ANNO_DIR     = PROJ / "submission/supplementary/fiveSpecies_annotations"
DATAFILE_DIR = PROJ / "submission/supplementary/datafiles"
OUTPUT_DIR   = PROJ / "igv_browser"

SPECIES_GENOME_MAP = {
    'dmel': 'dmel6',
    'dsim': 'dsim2',
    'dser': 'dser1',
    'dsan': 'dsan1',
    'dyak': 'dyak2',
}

ANNOTATION_STATUSES = ['UJCanno', 'ERPanno', 'ERPnovel']

GTF_SOURCE = {
    'UJCanno':  'anno',
    'ERPanno':  'data',
    'ERPnovel': 'data',
}

colormap  = LinearSegmentedColormap.from_list('mf', ['blue', 'purple', 'red'])
normalize = Normalize(vmin=-9, vmax=9)

def effect_size_to_rgb_string(effect_size):
    if pd.isna(effect_size):
        return "180,180,180"
    clamped = max(-9, min(9, effect_size))
    r, g, b, _ = colormap(normalize(clamped))
    return f"{int(r*255)},{int(g*255)},{int(b*255)}"


def parse_gtf(gtf_file, target_jxnhashes):
    transcripts = {}
    with open(gtf_file) as fh:
        for line in fh:
            if line.startswith("#") or not line.strip():
                continue
            fields = line.strip().split("\t")
            if fields[2] != "exon":
                continue
            tid = re.search(r'transcript_id "([^"]+)"', fields[8])
            if not tid:
                continue
            tid = tid.group(1)
            if tid not in target_jxnhashes:
                continue
            transcripts.setdefault(tid, []).append((
                fields[0],
                int(fields[3]) - 1,
                int(fields[4]),
                fields[6],
            ))
    return transcripts


def build_bed12(transcripts, jxnhash_to_effect_size, jxnhash_to_geneid,
                jxnhash_to_male_reads, jxnhash_to_female_reads):
    records = []
    for tid, exons in transcripts.items():
        exons        = sorted(exons, key=lambda x: x[1])
        chrom        = exons[0][0]
        strand       = exons[0][3]
        tx_start     = exons[0][1]
        tx_end       = exons[-1][2]
        block_count  = len(exons)
        block_sizes  = ",".join(str(e[2] - e[1]) for e in exons)
        block_starts = ",".join(str(e[1] - tx_start) for e in exons)
        effect_size  = jxnhash_to_effect_size.get(tid, float("nan"))
        gene_id      = jxnhash_to_geneid.get(tid, "NA")
        male_reads   = jxnhash_to_male_reads.get(tid, 0)
        female_reads = jxnhash_to_female_reads.get(tid, 0)
        rgb_string   = effect_size_to_rgb_string(effect_size)
        es_str       = f"{effect_size:.3f}" if pd.notna(effect_size) else "NA"
        # label format: geneID|male_reads|female_reads|effect_size
        name = f"{gene_id}|M:{int(male_reads)}|F:{int(female_reads)}|ES:{es_str}"
        records.append({
            "chrom":       chrom,
            "start":       tx_start,
            "end":         tx_end,
            "name":        name,
            "score":       0,
            "strand":      strand,
            "thickStart":  tx_start,
            "thickEnd":    tx_end,
            "itemRgb":     rgb_string,
            "blockCount":  block_count,
            "blockSizes":  block_sizes,
            "blockStarts": block_starts,
        })
    return pd.DataFrame(records).sort_values(["chrom", "start"]) if records else pd.DataFrame()

for species, genome in SPECIES_GENOME_MAP.items():
    propReads_file = TABLES_DIR / f"UJC_propReads_table_bySex_from_datafile_jxnHash_w_flags_{species}.csv"
    anno_gtf       = ANNO_DIR / f"fiveSpecies_2_{genome}_anno_files/fiveSpecies_2_{genome}_ujc.gtf"
    data_gtf       = DATAFILE_DIR / f"{species}_data_2_{genome}_ujc_noMultiGene.gtf"

    df = pd.read_csv(propReads_file, low_memory=False)

    # sum all replicate rawCnts columns for male and female
    f_cols = [c for c in df.columns if c.startswith(f"rawCnts_{species}_F_rep")]
    m_cols = [c for c in df.columns if c.startswith(f"rawCnts_{species}_M_rep")]

    jxnhash_to_effect_size  = dict(zip(df["jxnHash"], df["effect_size_Equal"]))
    jxnhash_to_geneid       = dict(zip(df["jxnHash"], df["geneID"]))
    jxnhash_to_female_reads = dict(zip(df["jxnHash"], df[f_cols].sum(axis=1))) if f_cols else {}
    jxnhash_to_male_reads   = dict(zip(df["jxnHash"], df[m_cols].sum(axis=1))) if m_cols else {}

    for status in ANNOTATION_STATUSES:
        gtf_file = anno_gtf if GTF_SOURCE[status] == 'anno' else data_gtf

        subset      = df[df["annotation_status"] == status]
        target_jxns = set(subset["jxnHash"].dropna().unique())

        transcripts = parse_gtf(gtf_file, target_jxns)
        bed_df      = build_bed12(transcripts, jxnhash_to_effect_size, jxnhash_to_geneid,
                                  jxnhash_to_male_reads, jxnhash_to_female_reads)
        out_path    = OUTPUT_DIR / f"{species}_{status}_on_{genome}.bed"
        with open(out_path, "w") as fh:
            fh.write(f'track name="{species} {status}" itemRgb="On"\n')
            bed_df.to_csv(fh, sep="\t", header=False, index=False)