import pandas as pd import re from pathlib import Path from matplotlib.colors import LinearSegmentedColormap, Normalize PROJ = Path("/nfshome/mgaran/mclab/SHARE/McIntyre_Lab/sex_specific_splicing") TABLES_DIR = PROJ / "Tables" ANNO_DIR = PROJ / "submission/supplementary/fiveSpecies_annotations" DATAFILE_DIR = PROJ / "submission/supplementary/datafiles" OUTPUT_DIR = PROJ / "igv_browser" SPECIES_GENOME_MAP = { 'dmel': 'dmel6', 'dsim': 'dsim2', 'dser': 'dser1', 'dsan': 'dsan1', 'dyak': 'dyak2', } ANNOTATION_STATUSES = ['UJCanno', 'ERPanno', 'ERPnovel'] GTF_SOURCE = { 'UJCanno': 'anno', 'ERPanno': 'data', 'ERPnovel': 'data', } colormap = LinearSegmentedColormap.from_list('mf', ['blue', 'purple', 'red']) normalize = Normalize(vmin=-9, vmax=9) def effect_size_to_rgb_string(effect_size): if pd.isna(effect_size): return "180,180,180" clamped = max(-9, min(9, effect_size)) r, g, b, _ = colormap(normalize(clamped)) return f"{int(r*255)},{int(g*255)},{int(b*255)}" def parse_gtf(gtf_file, target_jxnhashes): transcripts = {} with open(gtf_file) as fh: for line in fh: if line.startswith("#") or not line.strip(): continue fields = line.strip().split("\t") if fields[2] != "exon": continue tid = re.search(r'transcript_id "([^"]+)"', fields[8]) if not tid: continue tid = tid.group(1) if tid not in target_jxnhashes: continue transcripts.setdefault(tid, []).append(( fields[0], int(fields[3]) - 1, int(fields[4]), fields[6], )) return transcripts def build_bed12(transcripts, jxnhash_to_effect_size, jxnhash_to_geneid, jxnhash_to_male_reads, jxnhash_to_female_reads): records = [] for tid, exons in transcripts.items(): exons = sorted(exons, key=lambda x: x[1]) chrom = exons[0][0] strand = exons[0][3] tx_start = exons[0][1] tx_end = exons[-1][2] block_count = len(exons) block_sizes = ",".join(str(e[2] - e[1]) for e in exons) block_starts = ",".join(str(e[1] - tx_start) for e in exons) effect_size = jxnhash_to_effect_size.get(tid, float("nan")) gene_id = jxnhash_to_geneid.get(tid, "NA") male_reads = jxnhash_to_male_reads.get(tid, 0) female_reads = jxnhash_to_female_reads.get(tid, 0) rgb_string = effect_size_to_rgb_string(effect_size) es_str = f"{effect_size:.3f}" if pd.notna(effect_size) else "NA" # label format: geneID|male_reads|female_reads|effect_size name = f"{gene_id}|M:{int(male_reads)}|F:{int(female_reads)}|ES:{es_str}" records.append({ "chrom": chrom, "start": tx_start, "end": tx_end, "name": name, "score": 0, "strand": strand, "thickStart": tx_start, "thickEnd": tx_end, "itemRgb": rgb_string, "blockCount": block_count, "blockSizes": block_sizes, "blockStarts": block_starts, }) return pd.DataFrame(records).sort_values(["chrom", "start"]) if records else pd.DataFrame() for species, genome in SPECIES_GENOME_MAP.items(): propReads_file = TABLES_DIR / f"UJC_propReads_table_bySex_from_datafile_jxnHash_w_flags_{species}.csv" anno_gtf = ANNO_DIR / f"fiveSpecies_2_{genome}_anno_files/fiveSpecies_2_{genome}_ujc.gtf" data_gtf = DATAFILE_DIR / f"{species}_data_2_{genome}_ujc_noMultiGene.gtf" df = pd.read_csv(propReads_file, low_memory=False) # sum all replicate rawCnts columns for male and female f_cols = [c for c in df.columns if c.startswith(f"rawCnts_{species}_F_rep")] m_cols = [c for c in df.columns if c.startswith(f"rawCnts_{species}_M_rep")] jxnhash_to_effect_size = dict(zip(df["jxnHash"], df["effect_size_Equal"])) jxnhash_to_geneid = dict(zip(df["jxnHash"], df["geneID"])) jxnhash_to_female_reads = dict(zip(df["jxnHash"], df[f_cols].sum(axis=1))) if f_cols else {} jxnhash_to_male_reads = dict(zip(df["jxnHash"], df[m_cols].sum(axis=1))) if m_cols else {} for status in ANNOTATION_STATUSES: gtf_file = anno_gtf if GTF_SOURCE[status] == 'anno' else data_gtf subset = df[df["annotation_status"] == status] target_jxns = set(subset["jxnHash"].dropna().unique()) transcripts = parse_gtf(gtf_file, target_jxns) bed_df = build_bed12(transcripts, jxnhash_to_effect_size, jxnhash_to_geneid, jxnhash_to_male_reads, jxnhash_to_female_reads) out_path = OUTPUT_DIR / f"{species}_{status}_on_{genome}.bed" with open(out_path, "w") as fh: fh.write(f'track name="{species} {status}" itemRgb="On"\n') bed_df.to_csv(fh, sep="\t", header=False, index=False)