/* gbdExample - illustrates accessing data from public UCSC
 * database or a locally installed mirror or downloaded file */
#include "common.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "binRange.h"
#include "genePred.h"
#include "genePredReader.h"
#include "hdb.h"

/**********Globals************/
/* Lengths of introns that intersect a range from the bed file. */
struct slDouble *overlapList = NULL;
/* Lengths of the remaining introns of the same genes. */
struct slDouble *otherList = NULL;
/**********Globals************/

void usage()
/* Explain usage and exit. */
{
errAbort(
  "gbdExample - find median length of introns overlapping ranges in input file\n"
  "usage:\n"
  " gbdExample db dbTable myBedFile method\n"
  " where db is the database name \n"
  " where dbTable is tableFileName in 'file' mode or else\n"
  " name of table to use in 'public' or 'localDb' modes \n"
  " where myBedFile is a bed file of genomic ranges \n"
  " where method is either 'public' or 'localDb' or 'file' \n"
  "\n");
}

/****************************************/
void binKeeperGpHashFree(struct hash **hash)
/* Free a hash of binKeepers as built by readGpToBinKeeper, together with
 * the genePred records held in each binKeeper.  Does nothing when *hash
 * is NULL.  Adapted from binKeeperPslHashFree in pslPseudo.c */
{
if (*hash != NULL)
    {
    struct hashEl *hashEl = NULL;
    struct hashCookie cookie = hashFirst(*hash);
    while ((hashEl = hashNext(&cookie)) != NULL)
        {
        struct binKeeper *bk = hashEl->val;
        struct binElement *elist = binKeeperFindAll(bk);
        struct binElement *el = NULL;
        for (el = elist; el != NULL; el = el->next)
            {
            struct genePred *gp = el->val;
            genePredFree(&gp);
            }
        /* The find result is a list of its own and has to be released
         * separately, the same way bkToGenePreds releases its result. */
        slFreeList(&elist);
        binKeeperFree(&bk);
        }
    hashFree(hash);
    }
}

/****************************************/
#define MAX_CHROM_SIZE 400000000

struct hash *readGpToBinKeeper(char *gpFileName)
/* Load a genePred flat file into a hash keyed by chromosome name, where
 * each value is a binKeeper holding that chromosome's genePred records
 * indexed by txStart/txEnd.  Free the result with binKeeperGpHashFree.
 * Adapted from readPslToBinKeeper in psl.c */
{
struct binKeeper *bk;
struct genePred *gp;
struct lineFile *pf = lineFileOpen(gpFileName, TRUE);
struct hash *hash = newHash(0);
char *row[21];
int genePredLineCtMin = 10;

while (lineFileNextRow(pf, row, genePredLineCtMin))
    {
    gp = genePredLoad(row);
    /* First record seen on a chromosome creates its binKeeper. */
    bk = hashFindVal(hash, gp->chrom);
    if (bk == NULL)
        {
        bk = binKeeperNew(0, MAX_CHROM_SIZE);
        hashAdd(hash, gp->chrom, bk);
        }
    binKeeperAdd(bk, gp->txStart, gp->txEnd, gp);
    }
lineFileClose(&pf);
return hash;
}

/****************************************/
struct genePred *bkToGenePreds(struct hash *gpHash, char *chrom, int start, int end)
/* Return a list of the genePreds stored in gpHash for 'chrom' that the
 * binKeeper reports for start-end, or NULL if there are none or the
 * chromosome is not in the hash.  The records are linked together through
 * their own 'next' fields but remain owned by gpHash; do not free them. */
{
struct genePred *gpList = NULL;
struct genePred *gp;
struct binKeeper *bk = hashFindVal(gpHash, chrom);
struct binElement *el, *elist;

/* A chromosome that never appeared in the gene file has no binKeeper;
 * report "nothing found" rather than searching through a NULL keeper. */
if (bk == NULL)
    return NULL;

elist = binKeeperFind(bk, start, end);
for (el = elist; el != NULL; el = el->next)
    {
    gp = el->val;
    if (gp != NULL)
        {
        slSafeAddHead(&gpList, gp);
        }
    }
slFreeList(&elist);
return gpList;
}

/****************************************/
struct sqlConnection *getHgdbtestConn(char *db)
/* Return a connection to database 'db' on the public UCSC MySQL server.
 * Host and user are hard-coded below and no password is supplied; the
 * same settings are also passed to hSetDbConnect. */
{
char *host = "genome-mysql.cse.ucsc.edu";
char *user = "genome";
char *password = NULL;

hSetDbConnect(host, db, user, password);
return sqlConnectRemote(host, user, password, db);
}

/****************************************/
int genePredLongestCmp(const void *va, const void *vb)
/* Compare to sort based on sizes of txEnd - txStart, largest first. */
{
const struct genePred *a = *((struct genePred **)va);
const struct genePred *b = *((struct genePred **)vb);
int lengthA = a->txEnd - a->txStart;
int lengthB = b->txEnd - b->txStart;
int dif = lengthB - lengthA;
return dif;
}

/****************************************/
void intronLengthsForOneBed(struct bed *bed, struct sqlConnection *conn,
	char *geneTable, struct hash *gpHash)
/* Get intron lengths for the longest gene that overlaps the 'bed'
 * coordinates and append them to the global overlapList / otherList.
 * Genes come from gpHash when it is non-NULL, otherwise from table
 * 'geneTable' via 'conn'.  Warns and returns if no gene is found. */
{
int bStart = bed->chromStart;
int bEnd = bed->chromEnd;
int i, intronStart, intronEnd;
struct genePred *gp = NULL;

/* Create singly-linked list of gene prediction structures
 * from either mySQL database or in-memory hash loaded from
 * flat-file */
if (gpHash == NULL)
    gp = genePredReaderLoadRangeQuery(conn, geneTable, bed->chrom, bStart, bEnd, NULL);
else
    gp = bkToGenePreds(gpHash, bed->chrom, bStart, bEnd);

if (gp == NULL)
    {
    warn("No gene found in %s overlapping %s:%d-%d, skipping\n",
	geneTable, bed->chrom, bStart, bEnd);
    return;
    }

/* Sort the gene structure list by gene length; the head of the list
 * is then the longest gene, and only that one is examined. */
slSort(&gp, genePredLongestCmp);

for (i=1; i< gp->exonCount; ++i)
    {
    /* Each intron lies between the end of one exon and the start
     * of the next. */
    intronStart = gp->exonEnds[i - 1];
    intronEnd = gp->exonStarts[i];
    double intronLength = (double) (intronEnd - intronStart);
    struct slDouble *slIntronLength = slDoubleNew(intronLength);

    /* Then append the length to a list of the lengths of
     * either the introns that overlap the region specified by the
     * 'bed' coordinates or the introns that don't overlap the 'bed' */
    if (positiveRangeIntersection(bStart, bEnd, intronStart, intronEnd))
        slSafeAddHead(&overlapList, slIntronLength);
    else
        slSafeAddHead(&otherList, slIntronLength);
    }

/* Records taken from gpHash still belong to the hash and are released
 * by binKeeperGpHashFree; only the database-loaded list is freed here. */
if (gpHash == NULL)
    genePredFreeList(&gp);
}

/****************************************/
static void printIntronMedian(char *what, struct slDouble *list)
/* Print the median of 'list', labelled with 'what', or a note when the
 * list is empty. */
{
/* NOTE(review): slDoubleMedian is believed to errAbort on an empty
 * list - confirm against the library source.  Guarding here keeps a
 * run with no introns in one category from losing the other result. */
if (list == NULL)
    printf("No %s introns found, no median to report\n", what);
else
    printf("Median value of lengths of %s introns = %f\n", what, slDoubleMedian(list));
}

void processBedFile(char *bedFile, struct sqlConnection *conn, char *geneTable,
	struct hash *gpHash)
/* Read the bed file, collect intron lengths for every range in it and
 * print the median length of the overlapping and of the other introns. */
{
struct bed *bedList=NULL, *bed=NULL;

bedList = bedLoadAll(bedFile);
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    intronLengthsForOneBed(bed, conn, geneTable, gpHash);
    }
printIntronMedian("overlapping", overlapList);
printIntronMedian("other", otherList);
bedFreeList(&bedList);
}

/****************************************/
/* gbdExample.c */
int main(int argc, char *argv[])
/* Find median value of lengths of introns overlapping ranges in input file
 * and compare with lengths of other introns in those genes
 * Program reads 'bed file' of genomic regions and
 * extracts longest gene overlapping each region.  For each
 * gene, lengths of introns overlapping the region as well
 * as those not overlapping the region are computed.  Medians
 * of each set of intron lengths are printed out.
 * Program is compiled with:
 *   gcc -g -Wall -Werror -I${KENTSRC}/inc -I${KENTSRC}/hg/inc
 *       -o gbdExample gbdExample.c $KENTSRC/lib/$MACHTYPE/jkhgap.a
 *       $KENTSRC/lib/$MACHTYPE/jkweb.a $MYSQLLIBS -lm
 * where $KENTSRC is the local location of the kent source
 * tree, $MYSQLLIBS is location of the local mySQL libraries
 * and $MACHTYPE is the machine type environmental variable
 * Once compiled and linked, the program is run as e.g.:
 *   ./gbdExample sacCer1 sgdGene myYeastBedFile localDb
 * or
 *   ./gbdExample hg17 refGene myHumanBedFile public
 * or
 *   ./gbdExample sacCer1 sgdGene.txt myYeastBedFile file
 * where the first argument is the db to use (this parameter is
 * ignored in 'file' mode), the second argument is the name of
 * the db or file gene table, third program argument is the location
 * file of (bed) locations to be screened for intron lengths,
 * and the fourth argument indicates whether to use
 * the public UCSC database at genome-mysql.cse.ucsc.edu
 * or a locally installed mirror or a downloaded file. */
{
/* Check the argument count before touching argv[1..4]. */
if (argc != 5)
    usage();

char *db = argv[1];
char *geneTable = argv[2];
char *bedFile = argv[3];
char *method = argv[4];
struct sqlConnection *conn = NULL;
struct hash *gpHash = NULL;

/* 'file' loads the gene table from a flat file; 'public' connects to
 * the UCSC server; any other method value connects with sqlConnect(db). */
if (sameWord(method, "file"))
    gpHash = readGpToBinKeeper(geneTable);
else
    {
    conn = sameWord(method, "public") ? getHgdbtestConn(db) : sqlConnect(db);
    }

processBedFile(bedFile, conn, geneTable, gpHash);

slFreeList(&overlapList);
slFreeList(&otherList);
sqlDisconnect(&conn);
binKeeperGpHashFree(&gpHash);
return 0;
}