#!/usr/bin/env python3
"""
Read a dataframe with transcript or gene ids in the first column
and convert these to gene names, where possible.
Duplicate gene names are summed.
"""
from sys import argv

import genomepy
import pandas as pd


def convert(expression: pd.DataFrame, gp) -> pd.DataFrame:
    """
    Relabel the index of an expression table with gene names and merge duplicates.

    Each index label is resolved exactly once, in this order:

    1. transcript id -> gene name
    2. transcript id -> gene id -> gene name (the gene id is kept when it
       has no gene name)
    3. gene id -> gene name
    4. anything else is left untouched

    Rows that end up with the same label are summed.

    Parameters
    ----------
    expression : pandas.DataFrame
        Table indexed by transcript ids and/or gene ids.
    gp
        Annotation object providing ``gtf_dict(key, value)``, which must
        return a dict mapping the attribute ``key`` to the attribute
        ``value`` (a ``genomepy.Annotation`` when run as a script).

    Returns
    -------
    pandas.DataFrame
        The table grouped by the resolved labels, values summed per group.
    """
    tid2name = gp.gtf_dict("transcript_id", "gene_name")
    tid2gid = gp.gtf_dict("transcript_id", "gene_id")
    gid2name = gp.gtf_dict("gene_id", "gene_name")

    def resolve(label):
        """Return the best available gene label for one original index label."""
        if label in tid2name:
            return tid2name[label]
        if label in tid2gid:
            gene_id = tid2gid[label]
            # fall back to the gene id when no gene name is known
            return gid2name.get(gene_id, gene_id)
        return gid2name.get(label, label)

    # Resolve every label in a single pass. Chaining three renames would feed
    # already-converted names back into the later mappings, so a gene name
    # that happens to equal some transcript id or gene id would be renamed
    # a second time and merged into the wrong gene.
    expression = expression.rename(index=resolve)

    # merge duplicate genes
    return expression.groupby(by=expression.index).sum()


if __name__ == "__main__":
    # Fail with a usage hint instead of a bare IndexError when arguments
    # are missing; extra arguments are still ignored.
    if len(argv) < 4:
        raise SystemExit(
            f"usage: {argv[0]} <expression.tsv> <annotation> <output.tsv>"
        )

    # argv[1]: tab-separated table whose first column holds the ids
    expression = pd.read_table(argv[1], index_col=0)
    # argv[2]: passed unchanged to genomepy.Annotation
    gp = genomepy.Annotation(argv[2])
    expression = convert(expression, gp)
    # argv[3]: destination of the tab-separated result
    expression.to_csv(argv[3], sep="\t")
