/* gbdExample - illustrates accessing data from public UCSC
 * database or a locally installed mirror or downloaded file */
#include "common.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "binRange.h"
#include "genePred.h"
#include "genePredReader.h"
#include "hdb.h"

/**********Globals************/
/* Lengths of introns that intersect a bed range.  Filled by
 * intronLengthsForOneBed, reported by processBedFile, freed in main. */
struct slDouble *overlapList = NULL;
/* Lengths of the introns of the same genes that do not intersect the
 * bed range.  Filled, reported and freed alongside overlapList. */
struct slDouble *otherList = NULL;

/**********Functions************/
void usage()
/* Print the command-line synopsis through errAbort, which ends the program. */
{
/* Kept as one table so the text can be read top to bottom as the user sees it. */
static const char usageText[] =
    "gbdExample - find median length of introns overlapping ranges in input file\n"
    "usage:\n"
    "   gbdExample db dbTable myBedFile method\n"
    "       where db is the database name \n"
    "       where dbTable is tableFileName in 'file' mode or else\n"
    "              name of table to use in 'public' or 'localDb' modes \n"
    "       where myBedFile is a bed file of genomic ranges \n"
    "       where method is either 'public' or 'localDb' or 'file' \n"
    "\n";
errAbort("%s", usageText);
}

/****************************************/
void binKeeperGpHashFree(struct hash **hash)
/* Free a hash whose values are binKeepers holding genePreds (the structure
 * built by readGpToBinKeeper).  Every genePred is freed, then each
 * binKeeper, and finally the hash is handed to hashFree.
 * Does nothing when hash or *hash is NULL.
 * Adapted from binKeeperPslHashFree in pslPseudo.c. */
{
struct hashEl *hashEl = NULL;
struct hashCookie cookie;
if (hash == NULL || *hash == NULL)
    return;
cookie = hashFirst(*hash);
while ((hashEl = hashNext(&cookie)) != NULL)
    {
    struct binKeeper *bk = hashEl->val;
    struct binElement *elist = binKeeperFindAll(bk);
    struct binElement *el = NULL;
    for (el = elist; el != NULL; el = el->next)
        {
        struct genePred *gp = el->val;
        genePredFree(&gp);
        }
    /* The list returned by binKeeperFindAll belongs to the caller; release
     * it the same way bkToGenePreds releases its binKeeperFind result. */
    slFreeList(&elist);
    binKeeperFree(&bk);
    }
hashFree(hash);
}

/****************************************/
struct hash *readGpToBinKeeper(char *gpFileName)
/* Load the genePred rows of gpFileName into a hash keyed by chromosome name.
 * Each hash value is a binKeeper covering 0..MAX_CHROM_SIZE that stores the
 * genePreds of that chromosome under their txStart..txEnd range.
 * main releases the result with binKeeperGpHashFree.
 * Adapted from readPslToBinKeeper in psl.c. */
{
#define MAX_CHROM_SIZE 400000000
int genePredLineCtMin = 10;	/* number of columns requested per row */
char *row[21] ;
struct lineFile *pf = lineFileOpen(gpFileName , TRUE);
struct hash *chromBins = newHash(0);
for (;;)
    {
    struct genePred *gene;
    if (!lineFileNextRow(pf, row, genePredLineCtMin))
        break;
    gene = genePredLoad(row);
    /* First gene seen on this chromosome: give the chromosome a binKeeper. */
    if (hashLookup(chromBins, gene->chrom) == NULL)
        hashAdd(chromBins, gene->chrom, binKeeperNew(0, MAX_CHROM_SIZE));
    binKeeperAdd(hashMustFindVal(chromBins, gene->chrom),
                 gene->txStart, gene->txEnd, gene);
    }
lineFileClose(&pf);
return chromBins;
}

/****************************************/
struct genePred *bkToGenePreds(struct hash *gpHash, char *chrom, int start, int end)
/* Return a list of the genePreds that binKeeperFind reports for start..end
 * in the binKeeper stored under chrom in gpHash, or NULL when chrom has no
 * binKeeper or nothing is found.
 * The genePreds are not copied: the list is threaded through the 'next'
 * field of the structures held by the binKeeper, so the caller must not
 * free them (binKeeperGpHashFree does that). */
{
struct genePred *gpList = NULL;
struct genePred *gp;
struct binElement *el, *elist;
struct binKeeper *bk = hashFindVal(gpHash, chrom);
/* A chromosome that never appeared in the gene file has no binKeeper;
 * report "no genes" instead of passing NULL on to binKeeperFind. */
if (bk == NULL)
    return NULL;
elist = binKeeperFind(bk, start, end);
for (el = elist; el != NULL; el = el->next)
    {
    gp = el->val;
    if (gp != NULL)
        slSafeAddHead(&gpList, gp);
    }
/* Only the binElement wrappers are freed here, not the genePreds. */
slFreeList(&elist);
return gpList;
}

/****************************************/
struct sqlConnection *getHgdbtestConn(char *db) 
/* Return a connection to database 'db' on the public UCSC MySQL server.
 * Host and user are hard-coded and no password is supplied.  The same
 * settings are first passed to hSetDbConnect (presumably so that hdb-level
 * calls use this server too -- confirm against hdb.h). */
{
char *publicHost = "genome-mysql.cse.ucsc.edu";
char *publicUser = "genome";
char *noPassword = NULL;
struct sqlConnection *conn;
hSetDbConnect(publicHost, db, publicUser, noPassword);
conn = sqlConnectRemote(publicHost, publicUser, noPassword, db);
return conn;
}
/****************************************/
int genePredLongestCmp(const void *va, const void *vb)
/* Comparison function for sorting genePreds by transcript span
 * (txEnd - txStart), largest span first.  va and vb point at
 * genePred pointers. */
{
const struct genePred *geneA = *((struct genePred **)va);
const struct genePred *geneB = *((struct genePred **)vb);
int spanA = geneA->txEnd - geneA->txStart;
int spanB = geneB->txEnd - geneB->txStart;
/* Positive when b is longer, so longer genes sort ahead of shorter ones. */
return spanB - spanA;
}

/****************************************/
void intronLengthsForOneBed(struct bed *bed, struct sqlConnection *conn,
	char *geneTable, struct hash *gpHash)
/* Record intron lengths for the longest gene overlapping the range in 'bed'.
 * Genes are taken from gpHash when it is non-NULL, otherwise from a range
 * query of geneTable over conn.  The length of each intron of the longest
 * gene is pushed onto the global overlapList if the intron intersects the
 * bed range, and onto otherList if it does not.  When no gene is found a
 * warning is issued and nothing is recorded. */
{
int regionStart = bed->chromStart;
int regionEnd = bed->chromEnd;
int exonIx;
struct genePred *genes = NULL;
if (gpHash != NULL)
    genes = bkToGenePreds(gpHash, bed->chrom, regionStart, regionEnd);
else
    genes = genePredReaderLoadRangeQuery(conn, geneTable,
                                         bed->chrom, regionStart, regionEnd, NULL);
if (genes == NULL)
    {
    warn("No gene found in %s overlapping %s:%d-%d, skipping\n", 
         geneTable, bed->chrom, regionStart, regionEnd);
    return;
    }
/* Sorted longest-first, so the head of the list is the longest gene. */
slSort(&genes, genePredLongestCmp);
for (exonIx = 1; exonIx < genes->exonCount; ++exonIx)
    {
    /* An intron runs from the end of one exon to the start of the next. */
    int intronStart = genes->exonEnds[exonIx - 1];
    int intronEnd = genes->exonStarts[exonIx];
    struct slDouble *lengthEl = slDoubleNew((double) (intronEnd - intronStart));
    struct slDouble **destList =
        positiveRangeIntersection(regionStart, regionEnd, intronStart, intronEnd)
            ? &overlapList : &otherList;
    slSafeAddHead(destList, lengthEl);
    }
/* Genes loaded from the database are freed here; genes that came from
 * gpHash are left alone because the hash still holds them. */
if (gpHash == NULL)
    genePredFreeList(&genes);
}

/****************************************/
static void printIntronMedian(const char *label, struct slDouble *lengths)
/* Print the median of 'lengths' for the intron class named by 'label', or a
 * note when the list is empty (slDoubleMedian cannot handle an empty list). */
{
if (lengths == NULL)
    printf("No %s introns found, median not computed\n", label);
else
    printf("Median value of lengths of %s introns = %f\n", label,
           slDoubleMedian(lengths));
}

void processBedFile(char *bedFile, struct sqlConnection *conn, 
	char *geneTable, struct hash *gpHash)
/* Load every range in bedFile, collect intron lengths for each one with
 * intronLengthsForOneBed, then print the median of the overlapping and of
 * the non-overlapping intron lengths gathered in the global lists.
 * conn/geneTable/gpHash are passed through to intronLengthsForOneBed. */
{
struct bed *bedList = bedLoadAll(bedFile);
struct bed *bed = NULL;
for (bed = bedList; bed != NULL; bed = bed->next)
    intronLengthsForOneBed(bed, conn, geneTable, gpHash);
/* Either list may legitimately be empty (no gene found, or all introns in
 * one class); report that instead of aborting before the second median. */
printIntronMedian("overlapping", overlapList);
printIntronMedian("other", otherList);
bedFreeList(&bedList);
}

/****************************************/
/* gbdExample.c */
int main(int argc, char *argv[])
/* Find median value of lengths of introns overlapping ranges in input file 
 * and compare with lengths of other introns in those genes.
 * Program reads a 'bed file' of genomic regions and 
 * extracts longest gene overlapping each region. For each
 * gene, lengths of introns overlapping the region as well
 * as those not overlapping the region are computed. The median
 * of each set of intron lengths is printed out.
 * Program is compiled with:
gcc -g -Wall -Werror -I${KENTSRC}/inc -I${KENTSRC}/hg/inc -o gbdExample gbdExample.c $KENTSRC/lib/$MACHTYPE/jkhgap.a $KENTSRC/lib/$MACHTYPE/jkweb.a $MYSQLLIBS -lm
 * where $KENTSRC is the local location of the kent source
 * tree, $MYSQLLIBS is location of the local mySQL libraries
 * and $MACHTYPE is the machine type environmental variable.
 * Once compiled and linked, the program is run as e.g.:
./gbdExample sacCer1 sgdGene myYeastBedFile localDb
 * or
./gbdExample hg17 refGene myHumanBedFile public
 * or
./gbdExample sacCer1 sgdGene.txt myYeastBedFile file
 * where the first argument is the db to use (this parameter is
 * ignored in 'file' mode), the second argument is the name of 
 * the db or file gene table, the third argument is the 
 * file of (bed) locations to be screened for intron lengths, 
 * and the fourth argument indicates whether to use
 * the public UCSC database at genome-mysql.cse.ucsc.edu
 * or a locally installed mirror or a downloaded file.
 * Any method other than 'file' or 'public' is treated as 'localDb'.
 */
{
char *db, *geneTable, *bedFile, *method;
struct sqlConnection *conn = NULL;
struct hash *gpHash = NULL;
/* Check the argument count before reading argv[1..4]; with too few
 * arguments those slots lie beyond the end of argv. */
if (argc != 5)
    usage();
db = argv[1];
geneTable = argv[2];
bedFile = argv[3];
method = argv[4];
if (sameWord(method, "file"))
    gpHash = readGpToBinKeeper(geneTable);
else
    {
    conn = sameWord(method, "public") ?
        getHgdbtestConn(db) : sqlConnect(db);
    }
processBedFile(bedFile, conn, geneTable, gpHash);
slFreeList(&overlapList);
slFreeList(&otherList);
sqlDisconnect(&conn);
binKeeperGpHashFree(&gpHash);
return 0;
}