/* gbdExample - illustrates accessing data from public UCSC
* database or a locally installed mirror or downloaded file */
#include "common.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "binRange.h"
#include "genePred.h"
#include "genePredReader.h"
#include "hdb.h"
/**********Globals************/
/* Lengths of introns that intersect a bed range; intronLengthsForOneBed
 * pushes onto this list and main frees it. */
struct slDouble *overlapList = NULL;
/* Lengths of introns (of the same genes) that do not intersect the bed
 * range; intronLengthsForOneBed pushes onto this list and main frees it. */
struct slDouble *otherList = NULL;
/**********Globals************/
void usage()
/* Report the command-line synopsis through errAbort. */
{
/* Kept as one string and passed through "%s" so the help text is never
 * interpreted as a format. */
static char *helpText =
    "gbdExample - find median length of introns overlapping ranges in input file\n"
    "usage:\n"
    " gbdExample db dbTable myBedFile method\n"
    " where db is the database name \n"
    " where dbTable is tableFileName in 'file' mode or else\n"
    " name of table to use in 'public' or 'localDb' modes \n"
    " where myBedFile is a bed file of genomic ranges \n"
    " where method is either 'public' or 'localDb' or 'file' \n"
    "\n";
errAbort("%s", helpText);
}
/****************************************/
void binKeeperGpHashFree(struct hash **hash)
/* Free a hash whose values are binKeepers holding genePreds (the structure
 * built by readGpToBinKeeper): each genePred, each binKeeper, then the hash
 * itself via hashFree.  Does nothing when *hash is NULL.
 * Adapted from binKeeperPslHashFree in pslPseudo.c. */
{
struct hashEl *hashEl = NULL;
struct hashCookie cookie;

if (*hash == NULL)
    return;
cookie = hashFirst(*hash);
while ((hashEl = hashNext(&cookie)) != NULL)
    {
    struct binKeeper *bk = hashEl->val;
    struct binElement *elist = binKeeperFindAll(bk);
    struct binElement *el = NULL;
    for (el = elist; el != NULL; el = el->next)
        {
        struct genePred *gp = el->val;
        genePredFree(&gp);
        }
    /* The element list returned by the binKeeper query is a separate
     * allocation from the binKeeper itself (bkToGenePreds frees its query
     * result the same way); release it so it is not leaked per chromosome. */
    slFreeList(&elist);
    binKeeperFree(&bk);
    }
hashFree(hash);
}
/****************************************/
struct hash *readGpToBinKeeper(char *gpFileName)
/* Load the gene predictions in gpFileName into a new hash that maps each
 * chromosome name to a binKeeper; every genePred is stored in its
 * chromosome's binKeeper under the range txStart..txEnd.  Rows are read with
 * lineFileNextRow asking for 10 fields.  The result can be released with
 * binKeeperGpHashFree.
 * Adapted from readPslToBinKeeper in psl.c. */
{
/* Upper coordinate handed to binKeeperNew for every chromosome. */
#define MAX_CHROM_SIZE 400000000
struct lineFile *lf = lineFileOpen(gpFileName, TRUE);
struct hash *chromBins = newHash(0);
char *fields[21];
int fieldsWanted = 10;

while (lineFileNextRow(lf, fields, fieldsWanted))
    {
    struct genePred *gene = genePredLoad(fields);
    struct binKeeper *chromBk = NULL;

    /* First gene seen on this chromosome: create its binKeeper. */
    if (hashLookup(chromBins, gene->chrom) == NULL)
        hashAdd(chromBins, gene->chrom, binKeeperNew(0, MAX_CHROM_SIZE));

    chromBk = hashMustFindVal(chromBins, gene->chrom);
    binKeeperAdd(chromBk, gene->txStart, gene->txEnd, gene);
    }
lineFileClose(&lf);
return chromBins;
}
/****************************************/
struct genePred *bkToGenePreds(struct hash *gpHash, char *chrom, int start, int end)
/* Return a list of the genePreds that binKeeperFind reports for start..end
 * in the binKeeper stored under chrom in gpHash.  Returns NULL when chrom
 * has no entry in gpHash or when nothing is found.
 * The returned genePreds are the same objects held by gpHash, chained
 * through their own next pointers, so the caller must not free them. */
{
struct genePred *gpList = NULL;
struct genePred *gp;
struct binKeeper *bk = hashFindVal(gpHash, chrom);
struct binElement *el, *elist;

/* A chromosome that appears in the bed file but not in the gene file has
 * no binKeeper; querying a NULL binKeeper would crash. */
if (bk == NULL)
    return NULL;

elist = binKeeperFind(bk, start, end);
for (el = elist; el != NULL; el = el->next)
    {
    gp = el->val;
    if (gp != NULL)
        {
        slSafeAddHead(&gpList, gp);
        }
    }
/* Only the query result list is freed here, not the genePreds. */
slFreeList(&elist);
return gpList;
}
/****************************************/
struct sqlConnection *getHgdbtestConn(char *db)
/* Return a connection to database db on the public UCSC MySQL server.
 * Host and user are hard-coded and no password is supplied; the same
 * settings are also registered with hSetDbConnect before connecting. */
{
char *publicHost = "genome-mysql.cse.ucsc.edu";
char *publicUser = "genome";
char *noPassword = NULL;
struct sqlConnection *remoteConn = NULL;

hSetDbConnect(publicHost, db, publicUser, noPassword);
remoteConn = sqlConnectRemote(publicHost, publicUser, noPassword, db);
return remoteConn;
}
/****************************************/
int genePredLongestCmp(const void *va, const void *vb)
/* Sort comparator over pointers to genePred pointers.  Orders by transcript
 * span (txEnd - txStart) with the largest span first: the result is the
 * span of b minus the span of a. */
{
const struct genePred *first = *((struct genePred **)va);
const struct genePred *second = *((struct genePred **)vb);
int spanFirst = first->txEnd - first->txStart;
int spanSecond = second->txEnd - second->txStart;
return spanSecond - spanFirst;
}
/****************************************/
void intronLengthsForOneBed(struct bed *bed, struct sqlConnection *conn,
	char *geneTable, struct hash *gpHash)
/* Record the intron lengths of the longest gene found for the range in 'bed'.
 * Genes are fetched from geneTable through conn when gpHash is NULL, and
 * from gpHash otherwise.  Each intron length of the longest gene is pushed
 * onto the global overlapList when the intron intersects the bed range,
 * and onto the global otherList when it does not.  If no gene is found a
 * warning is issued and nothing is recorded. */
{
int regionStart = bed->chromStart;
int regionEnd = bed->chromEnd;
int fromDatabase = (gpHash == NULL);
struct genePred *genes = NULL;
struct genePred *longest = NULL;
int exonIx;

if (fromDatabase)
    genes = genePredReaderLoadRangeQuery(conn, geneTable,
                                         bed->chrom, regionStart, regionEnd, NULL);
else
    genes = bkToGenePreds(gpHash, bed->chrom, regionStart, regionEnd);

if (genes == NULL)
    {
    warn("No gene found in %s overlapping %s:%d-%d, skipping\n",
         geneTable, bed->chrom, regionStart, regionEnd);
    return;
    }

/* genePredLongestCmp puts the largest span first, so after sorting the
 * head of the list is the longest gene. */
slSort(&genes, genePredLongestCmp);
longest = genes;

/* Intron k lies between the end of exon k-1 and the start of exon k. */
for (exonIx = 1; exonIx < longest->exonCount; ++exonIx)
    {
    int gapStart = longest->exonEnds[exonIx - 1];
    int gapEnd = longest->exonStarts[exonIx];
    struct slDouble *lengthEl = slDoubleNew((double) (gapEnd - gapStart));
    struct slDouble **target = &otherList;

    if (positiveRangeIntersection(regionStart, regionEnd, gapStart, gapEnd))
        target = &overlapList;
    slSafeAddHead(target, lengthEl);
    }

/* Only the database path produced a list owned by this function; genes
 * taken from gpHash stay owned by the hash. */
if (fromDatabase)
    genePredFreeList(&genes);
}
/****************************************/
static void printIntronMedian(char *label, struct slDouble *lengths)
/* Print the median of 'lengths' for the introns described by 'label', or a
 * note that there were none. */
{
/* slDoubleMedian in the kent library aborts on an empty list, so an empty
 * category is reported instead of ending the program. */
if (lengths == NULL)
    printf("No %s introns found, median not computed\n", label);
else
    printf("Median value of lengths of %s introns = %f\n", label, slDoubleMedian(lengths));
}

void processBedFile(char *bedFile, struct sqlConnection *conn,
	char *geneTable, struct hash *gpHash)
/* Load every range in bedFile, collect intron lengths for each one with
 * intronLengthsForOneBed (which fills the global overlapList and otherList),
 * then print the median of each list to stdout.  A list that ended up empty
 * is reported as such rather than passed to slDoubleMedian.
 * conn/geneTable/gpHash are passed through unchanged to
 * intronLengthsForOneBed. */
{
struct bed *bedList = bedLoadAll(bedFile);
struct bed *bed = NULL;

for (bed = bedList; bed != NULL; bed = bed->next)
    {
    intronLengthsForOneBed(bed, conn, geneTable, gpHash);
    }
printIntronMedian("overlapping", overlapList);
printIntronMedian("other", otherList);
bedFreeList(&bedList);
}
/****************************************/
/* gbdExample.c */
int main(int argc, char *argv[])
/* Find median value of lengths of introns overlapping ranges in input file
 * and compare with lengths of other introns in those genes.
 * Program reads a 'bed file' of genomic regions and
 * extracts the longest gene overlapping each region. For each
 * gene, lengths of introns overlapping the region as well
 * as those not overlapping the region are computed. The median
 * of each set of intron lengths is printed out.
 * Program is compiled with:
     gcc -g -Wall -Werror -I${KENTSRC}/inc -I${KENTSRC}/hg/inc -o gbdExample gbdExample.c $KENTSRC/lib/$MACHTYPE/jkhgap.a $KENTSRC/lib/$MACHTYPE/jkweb.a $MYSQLLIBS -lm
 * where $KENTSRC is the local location of the kent source
 * tree, $MYSQLLIBS is the location of the local mySQL libraries
 * and $MACHTYPE is the machine type environment variable.
 * Once compiled and linked, the program is run as e.g.:
     ./gbdExample sacCer1 sgdGene myYeastBedFile localDb
 * or
     ./gbdExample hg17 refGene myHumanBedFile public
 * or
     ./gbdExample sacCer1 sgdGene.txt myYeastBedFile file
 * where the first argument is the db to use (this parameter is
 * ignored in 'file' mode), the second argument is the name of
 * the db table or gene table file, the third argument is the
 * file of (bed) locations to be screened for intron lengths,
 * and the fourth argument indicates whether to use
 * the public UCSC database at genome-mysql.cse.ucsc.edu
 * or a locally installed mirror or a downloaded file.
 * Any method other than 'file' or 'public' is treated as a local
 * database connection.
 */
{
char *db = NULL;
char *geneTable = NULL;
char *bedFile = NULL;
char *method = NULL;
struct sqlConnection *conn = NULL;
struct hash *gpHash = NULL;

/* Check the argument count before reading argv[1..4]; with fewer
 * arguments those slots lie past the end of argv. */
if (argc != 5)
    usage();
db = argv[1];
geneTable = argv[2];
bedFile = argv[3];
method = argv[4];

if (sameWord(method, "file"))
    gpHash = readGpToBinKeeper(geneTable);
else
    {
    conn = sameWord(method, "public") ?
        getHgdbtestConn(db) : sqlConnect(db);
    }
processBedFile(bedFile, conn, geneTable, gpHash);

/* Release the global length lists and whichever gene source was used;
 * the unused one of conn/gpHash is still NULL here. */
slFreeList(&overlapList);
slFreeList(&otherList);
sqlDisconnect(&conn);
binKeeperGpHashFree(&gpHash);
return 0;
}