001package org.biojava.nbio.genome.util; 002 003import com.google.common.collect.Range; 004 005import org.biojava.nbio.genome.parsers.genename.ChromPos; 006import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition; 007import org.slf4j.Logger; 008import org.slf4j.LoggerFactory; 009 010 011import java.io.StringWriter; 012import java.util.ArrayList; 013import java.util.List; 014 015/** 016 * A class that can map chromosomal positions to mRNA (coding sequence) positions. 017 * 018 * @author Andreas Prlic 019 */ 020 021public class ChromosomeMappingTools { 022 023 private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class); 024 025 026 private static final String newline = System.getProperty("line.separator"); 027 028 029 public static final String CHROMOSOME = "CHROMOSOME"; 030 public static final String CDS = "CDS"; 031 032 033 /** Pretty print the details of a GeneChromosomePosition to a String 034 * 035 * @param chromosomePosition 036 * @return 037 */ 038 public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){ 039 if ( chromosomePosition.getOrientation() == '+') 040 return formatExonStructureForward(chromosomePosition); 041 042 return formatExonStructureReverse(chromosomePosition); 043 044 } 045 046 047 private static String formatExonStructureForward(GeneChromosomePosition chromPos) { 048 049 StringWriter s = new StringWriter(); 050 051 List<Integer> exonStarts = chromPos.getExonStarts(); 052 List<Integer> exonEnds = chromPos.getExonEnds(); 053 054 int cdsStart = chromPos.getCdsStart(); 055 int cdsEnd = chromPos.getCdsEnd(); 056 057 boolean inCoding = false; 058 int codingLength = 0; 059 060 for (int i = 0; i < exonStarts.size(); i++) { 061 062 int start = exonStarts.get(i); 063 int end = exonEnds.get(i); 064 065 if (start <= cdsStart +1 && end >= cdsStart+1) { 066 067 inCoding = true; 068 codingLength += (end - cdsStart); 069 s.append(" UTR : ").append(format(start)).append(" - ").append(format(cdsStart)); 070 s.append(newline); 071 s.append(" -> Exon : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 072 s.append(newline); 073 074 } else if (start <= cdsEnd && end >= cdsEnd) { 075 //logger.debug(" <-- CDS end at: " + cdsEnd ); 076 inCoding = false; 077 codingLength += (cdsEnd - start); 078 079 s.append(" <- Exon : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 080 s.append(newline); 081 s.append(" UTR : " + (cdsEnd +1) + " - " + format(end)); 082 s.append(newline); 083 084 085 } else if (inCoding) { 086 // full exon is coding 087 codingLength += (end - start); 088 089 s.append(" Exon : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 090 s.append(newline); 091 } 092 093 } 094 s.append("Coding Length: "); 095 s.append((codingLength-3)+""); 096 s.append(newline); 097 return s.toString(); 098 } 099 100 101 102 private static String showGenePosLink(GeneChromosomePosition chromPos, Integer pos ) { 103 104 String spos = format(pos); 105 106 StringBuffer buf = new StringBuffer(); 107 buf.append("<a href=\"/pdb/gene/"); 108 buf.append(chromPos.getGeneName()); 109 buf.append("?chromosome="); 110 buf.append(chromPos.getChromosome()); 111 buf.append("&range="); 112 buf.append(pos); 113 buf.append("\">"); 114 buf.append(spos); 115 buf.append("</a>"); 116 117 return buf.toString(); 118 } 119 120 121 private static String formatExonStructureReverse(GeneChromosomePosition chromPos) { 122 StringWriter s = new StringWriter(); 123 124 List<Integer> exonStarts = chromPos.getExonStarts(); 125 List<Integer> exonEnds = chromPos.getExonEnds(); 126 127 128 int cdsStart = chromPos.getCdsStart(); 129 int cdsEnd = chromPos.getCdsEnd(); 130 131 // logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd)); 132 133 boolean inCoding = false; 134 int codingLength = 0; 135 136 if (cdsEnd < cdsStart) { 137 int tmp = cdsEnd; 138 cdsEnd = cdsStart; 139 cdsStart = tmp; 140 } 141 142 // map reverse 143 for (int i = exonStarts.size() - 1; i >= 0; i--) { 144 145 int end = exonStarts.get(i); 146 int start = exonEnds.get(i); 147 148 if (end < start) { 149 int tmp = end; 150 end = start; 151 start = tmp; 152 } 153 154 if (start <= cdsEnd && end >= cdsEnd) { 155 inCoding = true; 156 157 int tmpstart = start; 158 if (start < cdsStart) { 159 tmpstart = cdsStart; 160 } 161 codingLength += (cdsEnd - tmpstart); 162 163 s.append(" UTR :" + format(cdsEnd + 1) + " | " + format(end)); 164 s.append(newline); 165 if (tmpstart == start) 166 s.append(" -> "); 167 else 168 s.append(" <-> "); 169 s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 170 s.append(newline); 171 // single exon with UTR on both ends 172 if (tmpstart != start) 173 s.append(" UTR :" + format(cdsStart ) + " - " + format(start + 1)); 174 s.append(newline); 175 176 } else if (start <= cdsStart && end >= cdsStart) { 177 inCoding = false; 178 codingLength += (end - cdsStart); 179 180 s.append(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); 181 s.append(newline); 182 s.append(" UTR : " + format(start+1) + " - " + format(cdsStart )); 183 s.append(newline); 184 185 186 } else if (inCoding) { 187 // full exon is coding 188 codingLength += (end - start); 189 190 s.append(" Exon : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); 191 s.append(newline); 192 } else { 193 // e.g. see UBQLN3 194 s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end)); 195 s.append(newline); 196 } 197 } 198 199 s.append("CDS length: ").append(Integer.toString(codingLength - 3)); 200 s.append(newline); 201 202 return s.toString(); 203 } 204 205 /** 206 * Get the length of the CDS in nucleotides. 207 * 208 * @param chromPos 209 * @return length of the CDS in nucleotides. 210 */ 211 public static int getCDSLength(GeneChromosomePosition chromPos) { 212 213 logger.debug(chromPos.toString()); 214 215 logger.debug("chromosomal information: "); 216 217 logger.debug("Gene:" + chromPos.getGeneName()); 218 logger.debug(" Transcription (including UTRs): " + chromPos.getTranscriptionStart() + " - " + chromPos.getTranscriptionEnd() + " (length:" + (chromPos.getTranscriptionEnd() - chromPos.getTranscriptionStart()) + ")"); 219 logger.debug(" Orientation: " + chromPos.getOrientation()); 220 logger.debug(" CDS: " + (chromPos.getCdsStart()) + " - " + chromPos.getCdsEnd() + " (length: " + (chromPos.getCdsEnd() - chromPos.getCdsStart()) + ")"); 221 222 223 List<Integer> exonStarts = chromPos.getExonStarts(); 224 List<Integer> exonEnds = chromPos.getExonEnds(); 225 226 logger.debug("Exons:" + exonStarts.size()); 227 228 int cdsStart = chromPos.getCdsStart(); 229 int cdsEnd = chromPos.getCdsEnd(); 230 231 232 int codingLength; 233 if (chromPos.getOrientation().equals('+')) 234 codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd); 235 else 236 codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd); 237 return codingLength; 238 } 239 240 241 /** 242 * maps the position of a CDS nucleotide back to the genome 243 * 244 * @param cdsNucleotidePosition 245 * @return a ChromPos object 246 */ 247 public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) { 248 249 logger.debug(" ? Checking chromosome position for CDS position " + cdsNucleotidePosition); 250 251 List<Integer> exonStarts = chromPos.getExonStarts(); 252 List<Integer> exonEnds = chromPos.getExonEnds(); 253 254 logger.debug(" Exons:" + exonStarts.size()); 255 256 int cdsStart = chromPos.getCdsStart(); 257 int cdsEnd = chromPos.getCdsEnd(); 258 259 260 ChromPos chromosomePos = null; 261 262 if (chromPos.getOrientation().equals('+')) 263 264 chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd); 265 else 266 chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd); 267 268 logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at " + chromosomePos); 269 return chromosomePos; 270 271 } 272 273 /** returns a nicely formatted representation of the position 274 * 275 * @param chromosomePosition 276 * @return 277 */ 278 private static String format(int chromosomePosition){ 279 280 281 return String.format("%,d", chromosomePosition); 282 } 283 284 /** 285 * Get the CDS position mapped on the chromosome position 286 * 287 * @param exonStarts 288 * @param exonEnds 289 * @param cdsStart 290 * @param cdsEnd 291 * @return 292 */ 293 public static ChromPos getChromPosReverse(int cdsPos, List<Integer> exonStarts, 294 List<Integer> exonEnds, int cdsStart, int cdsEnd) { 295 296 boolean inCoding = false; 297 int codingLength = 0; 298 299 if (cdsEnd < cdsStart) { 300 int tmp = cdsEnd; 301 cdsEnd = cdsStart; 302 cdsStart = tmp; 303 } 304 305 int lengthExons = 0; 306 307 // map reverse 308 for (int i = exonStarts.size() - 1; i >= 0; i--) { 309 310 logger.debug("Exon #" + (i+1) + "/" + exonStarts.size()); 311 int end = exonStarts.get(i); 312 int start = exonEnds.get(i); 313 314 if (end < start) { 315 int tmp = end; 316 end = start; 317 start = tmp; 318 } 319 lengthExons += end - start; 320 321 logger.debug(" is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1)); 322 logger.debug(" CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength); 323 324 if (start+1 <= cdsEnd && end >= cdsEnd) { 325 326 327 // FIRST EXON 328 inCoding = true; 329 330 331 int tmpstart = start; 332 if (start < cdsStart) { 333 tmpstart = cdsStart; 334 } 335 336 // here one of the few places where we don't say start+1 337 int check = codingLength + cdsEnd - tmpstart ; 338 339 logger.debug("First Exon | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos ); 340 341 342 if ( ( check > cdsPos) ) 343 { 344 345 int tmp = cdsPos - codingLength ; 346 347 348 logger.debug(" -> found position in UTR exon: " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength); 349 350 return new ChromPos((cdsEnd - tmp), -1) ; 351 } 352 353 354 // don't add 1 here 355 codingLength += (cdsEnd - tmpstart ); 356 357 boolean debug = logger.isDebugEnabled(); 358 359 360 if ( debug ) { 361 362 StringBuffer b = new StringBuffer(); 363 364 b.append(" UTR :" + format(cdsEnd + 1) + " - " + format(end) + newline); 365 if (tmpstart == start) 366 b.append(" -> "); 367 else 368 b.append(" <-> "); 369 b.append("Exon :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline); 370 371 // single exon with UTR on both ends 372 if (tmpstart != start) 373 b.append(" UTR :" + format(cdsStart) + " - " + format(start + 1) + newline); 374 375 logger.debug(b.toString()); 376 } 377 } else if (start <= cdsStart && end >= cdsStart) { 378 379 // LAST EXON 380 inCoding = false; 381 382 if (codingLength + end - cdsStart >= cdsPos) { 383 384 // how many remaining coding nucleotides? 385 int tmp = codingLength + end - cdsStart - cdsPos ; 386 387 logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart)); 388 389 logger.debug(" -> XXX found position noncoding exon: cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd)); 390 391 return new ChromPos((cdsStart + tmp),-1); 392 } 393 394 codingLength += (end - cdsStart); 395 396 logger.debug(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3)); 397 logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart )); 398 399 } else if (inCoding) { 400 401 402 if (codingLength + end - start -1 >= cdsPos) { 403 404 int tmp = cdsPos - codingLength ; 405 406 if ( tmp > (end - start ) ) { 407 408 tmp = (end - start ); 409 410 logger.debug("changing tmp to " + tmp); 411 412 } 413 414 logger.debug(" " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp); 415 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); 416 logger.debug(" -> RRR found position coding exon: " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength); 417 418 return new ChromPos((end - tmp),cdsPos %3); 419 } 420 421 // full exon is coding 422 codingLength += (end - start) ; 423 424 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3)); 425 } else { 426 // e.g. see UBQLN3 427 428 logger.debug(" no translation!"); 429 } 430 431 432 logger.debug(" coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos); 433 434 435 } 436 437 logger.debug("length exons: " + lengthExons); 438 // could not map, or map over the full length?? 439 return new ChromPos(-1,-1); 440 441 442 } 443 444 /** 445 * Get the CDS position mapped onto the chromosome position 446 * 447 * @param exonStarts 448 * @param exonEnds 449 * @param cdsStart 450 * @param cdsEnd 451 * @return 452 */ 453 public static ChromPos getChromPosForward(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds, 454 int cdsStart, int cdsEnd) { 455 boolean inCoding = false; 456 int codingLength = 0; 457 458 int lengthExons = 0; 459 // map forward 460 for (int i = 0; i < exonStarts.size(); i++) { 461 462 463 // start can include UTR 464 int start = exonStarts.get(i); 465 int end = exonEnds.get(i); 466 467 lengthExons += end - start; 468 469 470 if (start <= cdsStart +1 && end >= cdsStart+1) { 471 // first exon with UTR 472 if (codingLength + (end - cdsStart-1) >= cdsPos) { 473 // we are reaching our target position 474 int tmp = cdsPos - codingLength; 475 476 477 logger.debug(cdsStart + " | " + codingLength + " | " + tmp); 478 logger.debug(" -> found position in UTR exon: #"+(i+1)+ " cdsPos:" + cdsPos + 479 " return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength); 480 481 // we start 1 after cdsStart... 482 return new ChromPos((cdsStart +1 + tmp),-1); 483 } 484 inCoding = true; 485 codingLength += (end - cdsStart); 486 487 logger.debug(" UTR : " + format(start+1) + " - " + (cdsStart )); 488 logger.debug(" -> Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); 489 490 } else if (start+1 <= cdsEnd && end >= cdsEnd) { 491 // LAST EXON with UTR 492 //logger.debug(" <-- CDS end at: " + cdsEnd ); 493 inCoding = false; 494 if (codingLength + (cdsEnd - start-1) >= cdsPos) { 495 int tmp = cdsPos - codingLength; 496 497 498 logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3)); 499 logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); 500 logger.debug( codingLength + " | " + tmp + " | " + format(start+1)); 501 logger.debug(" -> chromPosForward found position in non coding exon: " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength); 502 503 return new ChromPos((start +1 + tmp),cdsPos%3); 504 } 505 codingLength += (cdsEnd - start-1); 506 507 logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3)); 508 logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); 509 510 511 } else if (inCoding) { 512 // A standard coding Exon 513 // tests for the maximum length of this coding exon 514 if (codingLength + (end - start -1) >= cdsPos) { 515 516 // we are within the range of this exon 517 int tmp = cdsPos - codingLength ; 518 519 520 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength); 521 logger.debug(" -> found chr position in coding exon #" + (i+1) + ": cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength); 522 523 return new ChromPos((start +1 + tmp),cdsPos%3); 524 } 525 // full exon is coding 526 codingLength += (end - start ); 527 528 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); 529 } 530 // if ( inCoding ) 531 // logger.debug("exon phase at end:" + (codingLength % 3)); 532 // 533 // logger.debug(" coding length: " + codingLength); 534 535 536 } 537 538 //logger.debug("length exons: " + lengthExons); 539 //return codingLength - 3; 540 541 // could not map! 542 543 return new ChromPos(-1,-1); 544 } 545 546 547 /** 548 * Get the length of the coding sequence 549 * 550 * @param exonStarts 551 * @param exonEnds 552 * @param cdsStart 553 * @param cdsEnd 554 * @return 555 */ 556 public static int getCDSLengthReverse(List<Integer> exonStarts, 557 List<Integer> exonEnds, int cdsStart, int cdsEnd) { 558 559 boolean inCoding = false; 560 int codingLength = 0; 561 562 if (cdsEnd < cdsStart) { 563 int tmp = cdsEnd; 564 cdsEnd = cdsStart; 565 cdsStart = tmp; 566 } 567 568 int lengthExons = 0; 569 570 // map reverse 571 for (int i = exonStarts.size() - 1; i >= 0; i--) { 572 573 int end = exonStarts.get(i); 574 int start = exonEnds.get(i); 575 576 if (end < start) { 577 int tmp = end; 578 end = start; 579 start = tmp; 580 } 581 lengthExons += end - start; 582 583 if (start <= cdsEnd && end >= cdsEnd) { 584 inCoding = true; 585 586 587 int tmpstart = start; 588 if (start < cdsStart) { 589 tmpstart = cdsStart; 590 } 591 codingLength += (cdsEnd - tmpstart); 592 593 boolean debug = logger.isDebugEnabled(); 594 595 if ( debug) { 596 597 StringBuffer b = new StringBuffer(); 598 599 b.append(" UTR :" + (cdsEnd + 1) + " - " + (end) + newline); 600 if (tmpstart == start) 601 b.append(" -> "); 602 else 603 b.append(" <-> "); 604 b.append("Exon :" + tmpstart + " - " + cdsEnd + " | " + (cdsEnd - tmpstart) + " | " + codingLength + " | " + (codingLength % 3) + newline); 605 // single exon with UTR on both ends 606 if (tmpstart != start) 607 b.append(" UTR :" + (cdsStart - 1) + " - " + start + newline); 608 logger.debug(b.toString()); 609 610 } 611 } else if (start <= cdsStart && end >= cdsStart) { 612 inCoding = false; 613 codingLength += (end - cdsStart); 614 615 616 logger.debug(" <- Exon : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart) + " | " + (codingLength-3) + " | " + (codingLength % 3)); 617 logger.debug(" UTR : " + start + " - " + (cdsStart )); 618 619 620 621 } else if (inCoding) { 622 // full exon is coding 623 codingLength += (end - start); 624 625 logger.debug(" Exon : " + start + " - " + end + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); 626 } else { 627 // e.g. see UBQLN3 628 629 logger.debug(" no translation!"); 630 } 631 632 } 633 634 logger.debug("length exons: " + lengthExons + " codin length: " + (codingLength - 3)); 635 return codingLength - 3; 636 } 637 638 /** 639 * Get the length of the coding sequence 640 * 641 * @param exonStarts 642 * @param exonEnds 643 * @param cdsStart 644 * @param cdsEnd 645 * @return 646 */ 647 public static int getCDSLengthForward(List<Integer> exonStarts, List<Integer> exonEnds, 648 int cdsStart, int cdsEnd) { 649 boolean inCoding = false; 650 int codingLength = 0; 651 652 int lengthExons = 0; 653 // map forward 654 for (int i = 0; i < exonStarts.size(); i++) { 655 656 int start = exonStarts.get(i); 657 int end = exonEnds.get(i); 658 lengthExons += end - start; 659 660 661 logger.debug("forward exon: " + (start+1) + " - " + end + " | " + (end - start)); 662 663 if (start+1 <= cdsStart +1 && end >= cdsStart+1) { 664 665 inCoding = true; 666 codingLength += (end - cdsStart); 667 668 logger.debug(" UTR : " + start + " - " + (cdsStart )); 669 logger.debug(" -> Exon : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3)); 670 671 } else if (start+1 <= cdsEnd && end >= cdsEnd) { 672 673 inCoding = false; 674 codingLength += (cdsEnd - start); 675 676 logger.debug(" <- Exon : " + (start +1)+ " - " + cdsEnd + " | " + (cdsEnd - start+1) + " | " + codingLength + " | " + (codingLength % 3)); 677 logger.debug(" UTR : " + cdsEnd + 1 + " - " + end); 678 679 680 } else if (inCoding) { 681 // full exon is coding 682 codingLength += (end - start); 683 684 logger.debug(" Exon :" + (start+1) + " - " + end + " | " + (end - start+1) + " | " + codingLength + " | " + (codingLength % 3)); 685 } 686 687 688 } 689 690 logger.debug("length exons: " + Integer.toString(lengthExons)); 691 logger.debug("CDS length:" + Integer.toString((codingLength-3))); 692 693 return codingLength-3 ; 694 } 695 696 697 698 /** Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions) 699 * 700 * @param chromPos 701 * @return 702 */ 703 public static List<Range<Integer>> getCDSExonRanges(GeneChromosomePosition chromPos){ 704 if ( chromPos.getOrientation() == '+') 705 706 return getCDSExonRangesForward(chromPos,CDS); 707 708 return getCDSExonRangesReverse(chromPos,CDS); 709 } 710 711 712 /** Extracts the boundaries of the coding regions in chromosomal coordinates 713 * 714 * @param chromPos 715 * @return 716 */ 717 public static List<Range<Integer>> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){ 718 if ( chromPos.getOrientation() == '+') 719 return getCDSExonRangesForward(chromPos,CHROMOSOME); 720 721 return getCDSExonRangesReverse(chromPos,CHROMOSOME); 722 } 723 724 private static List<Range<Integer>> getCDSExonRangesReverse(GeneChromosomePosition chromPos, 725 String responseType) { 726 727 List<Integer> exonStarts = chromPos.getExonStarts(); 728 List<Integer> exonEnds = chromPos.getExonEnds(); 729 730 List<Range<Integer>> data = new ArrayList<>(); 731 int cdsStart = chromPos.getCdsStart(); 732 int cdsEnd = chromPos.getCdsEnd(); 733 734 boolean inCoding = false; 735 int codingLength = 0; 736 737 if (cdsEnd < cdsStart) { 738 int tmp = cdsEnd; 739 cdsEnd = cdsStart; 740 cdsStart = tmp; 741 } 742 743 java.lang.StringBuffer s =null; 744 745 boolean debug = logger.isDebugEnabled(); 746 747 748 if ( debug) 749 s = new StringBuffer(); 750 751 int lengthExons = 0; 752 753 // map reverse 754 for (int i = exonStarts.size() - 1; i >= 0; i--) { 755 756 int end = exonStarts.get(i); 757 int start = exonEnds.get(i); 758 759 if (end < start) { 760 int tmp = end; 761 end = start; 762 start = tmp; 763 } 764 lengthExons += end - start; 765 //s.append("Reverse exon: " + end + " - " + start + " | " + (end - start)); 766 //s.append(newline); 767 768 if (start <= cdsEnd && end >= cdsEnd) { 769 inCoding = true; 770 771 772 int tmpstart = start; 773 if (start < cdsStart) { 774 tmpstart = cdsStart; 775 } 776 codingLength += (cdsEnd - tmpstart); 777 if ( debug ) { 778 s.append(" UTR :").append(format(cdsEnd + 1)).append(" | ").append(format(end)); 779 s.append(newline); 780 if (tmpstart == start) 781 s.append(" -> "); 782 else 783 s.append(" <-> "); 784 s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3); 785 s.append(newline); 786 // single exon with UTR on both ends 787 if (tmpstart != start) 788 s.append(" UTR :").append(format(cdsStart)).append(" - ").append(format(start + 1)); 789 s.append(newline); 790 } 791 792 793 Range r ; 794 if ( responseType.equals(CDS)) 795 r = Range.closed(0,codingLength); 796 else 797 r = Range.closed(tmpstart,cdsEnd); 798 799 data.add(r); 800 801 } else if (start <= cdsStart && end >= cdsStart) { 802 inCoding = false; 803 804 Range r; 805 if ( responseType.equals(CDS)) 806 r = Range.closed(codingLength,codingLength+(end-cdsStart)); 807 else 808 r = Range.closed(cdsStart+1,end); 809 810 data.add(r); 811 812 codingLength += (end - cdsStart); 813 if (debug) { 814 s.append(" <- Exon : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); 815 s.append(newline); 816 s.append(" UTR : ").append(format(start + 1)).append(" - ").append(format(cdsStart)); 817 s.append(newline); 818 } 819 820 821 } else if (inCoding) { 822 // full exon is coding 823 824 Range r; 825 if ( responseType.equals(CDS)) 826 r = Range.closed(codingLength,codingLength+(end-start)); 827 else 828 r = Range.closed(start,end); 829 data.add(r); 830 831 codingLength += (end - start); 832 if (debug) { 833 s.append(" Exon : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); 834 s.append(newline); 835 } 836 } else { 837 // e.g. see UBQLN3 838 if ( debug ) { 839 s.append(" no translation! UTR: " + format(start) + " - " + format(end)); 840 s.append(newline); 841 } 842 } 843 } 844 if ( debug ) { 845 s.append("CDS length: ").append(Integer.toString(codingLength - 3)); 846 s.append(newline); 847 logger.debug(s.toString()); 848 } 849 850 return data; 851 } 852 853 854 855 private static List<Range<Integer>> getCDSExonRangesForward(GeneChromosomePosition chromPos, 856 String responseType) { 857 858 List<Range<Integer>> data = new ArrayList<>(); 859 List<Integer> exonStarts = chromPos.getExonStarts(); 860 List<Integer> exonEnds = chromPos.getExonEnds(); 861 862 863 int cdsStart = chromPos.getCdsStart(); 864 int cdsEnd = chromPos.getCdsEnd(); 865 866 boolean inCoding = false; 867 int codingLength = 0; 868 869 for (int i = 0; i < exonStarts.size(); i++) { 870 871 int start = exonStarts.get(i); 872 int end = exonEnds.get(i); 873 874 if (start <= cdsStart && end >= cdsStart) { 875 876 inCoding = true; 877 codingLength += (end - cdsStart); 878// 879 880 881 Range r; 882 if ( responseType.equals(CDS)) 883 r = Range.closed(0,codingLength); 884 else 885 r = Range.closed(cdsStart,end); 886 data.add(r); 887 888 } else if (start <= cdsEnd && end >= cdsEnd) { 889 //logger.debug(" <-- CDS end at: " + cdsEnd ); 890 inCoding = false; 891 892 893 Range r; 894 if ( responseType.equals(CDS)) 895 r = Range.closed(codingLength,codingLength+(cdsEnd-start)); 896 else 897 r = Range.closed(start,cdsEnd); 898 data.add(r); 899 codingLength += (cdsEnd - start); 900 901 } else if (inCoding) { 902 // full exon is coding 903 904 Range r; 905 if ( responseType.equals(CDS)) 906 r = Range.closed(codingLength,codingLength+(end-start)); 907 else 908 r = Range.closed(start,end); 909 data.add(r); 910 codingLength += (end - start); 911 912 } 913 914 } 915 916 return data; 917 } 918// 919 920 /** 921 * I have a genomic coordinate, where is it on the mRNA 922 * 923 * @param coordinate 924 * @param chromosomePosition 925 * @return 926 */ 927 public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) { 928 929 if ( chromosomePosition.getOrientation() == '+') 930 return getCDSPosForward(coordinate, 931 chromosomePosition.getExonStarts(), 932 chromosomePosition.getExonEnds(), 933 chromosomePosition.getCdsStart(), 934 chromosomePosition.getCdsEnd()); 935 936 return getCDSPosReverse(coordinate, 937 chromosomePosition.getExonStarts(), 938 chromosomePosition.getExonEnds(), 939 chromosomePosition.getCdsStart(), 940 chromosomePosition.getCdsEnd()); 941 } 942 943 /** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene 944 * living on the forward DNA strand. 945 * 946 * @param chromPos The genetic coordinate on a chromosome 947 * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) 948 * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) 949 * @param cdsStart The start position of a coding region 950 * @param cdsEnd The end position of a coding region 951 * 952 * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1) 953 * 954 * @author Yana Valasatava 955 */ 956 public static int getCDSPosForward(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds, 957 int cdsStart, int cdsEnd) { 958 959 // the genetic coordinate is not in a coding region 960 if ( (chromPos < (cdsStart+1) ) || ( chromPos > (cdsEnd+1) ) ) { 961 logger.debug("The "+format(chromPos)+" position is not in a coding region"); 962 return -1; 963 } 964 965 logger.debug("looking for CDS position for " +format(chromPos)); 966 967 // remove exons that are fully landed in UTRs 968 List<Integer> tmpS = new ArrayList<Integer>(exonStarts); 969 List<Integer> tmpE = new ArrayList<Integer>(exonEnds); 970 971 int j=0; 972 for (int i = 0; i < tmpS.size(); i++) { 973 if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) { 974 exonStarts.remove(j); 975 exonEnds.remove(j); 976 } 977 else { 978 j++; 979 } 980 } 981 982 // remove untranslated regions from exons 983 int nExons = exonStarts.size(); 984 exonStarts.remove(0); 985 exonStarts.add(0, cdsStart); 986 exonEnds.remove(nExons-1); 987 exonEnds.add(cdsEnd); 988 989 int codingLength = 0; 990 int lengthExon = 0; 991 992 // map the genetic coordinate on a stretch of a reverse strand 993 for (int i = 0; i < nExons; i++) { 994 995 int start = exonStarts.get(i); 996 int end = exonEnds.get(i); 997 998 lengthExon = end - start; 999 1000 if (start+1 <= chromPos && end >= chromPos ) { 1001 return codingLength + (chromPos-start); 1002 } 1003 else { 1004 codingLength += lengthExon; 1005 } 1006 } 1007 return -1; 1008 } 1009 1010 /** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene 1011 * living on the reverse DNA strand. 1012 * 1013 * @param chromPos The genetic coordinate on a chromosome 1014 * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) 1015 * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) 1016 * @param cdsStart The start position of a coding region 1017 * @param cdsEnd The end position of a coding region 1018 * 1019 * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1) 1020 * 1021 * @author Yana Valasatava 1022 */ 1023 public static int getCDSPosReverse(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds, 1024 int cdsStart, int cdsEnd) { 1025 1026 // the genetic coordinate is not in a coding region 1027 if ( (chromPos < (cdsStart+1)) || ( chromPos > (cdsEnd+1) ) ) { 1028 logger.debug("The "+format(chromPos)+" position is not in a coding region"); 1029 return -1; 1030 } 1031 1032 logger.debug("looking for CDS position for " +format(chromPos)); 1033 1034 // remove exons that are fully landed in UTRs 1035 List<Integer> tmpS = new ArrayList<Integer>(exonStarts); 1036 List<Integer> tmpE = new ArrayList<Integer>(exonEnds); 1037 1038 int j=0; 1039 for (int i = tmpS.size() - 1; i >= 0; i--) { 1040 if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) { 1041 exonStarts.remove(j); 1042 exonEnds.remove(j); 1043 } 1044 else { 1045 j++; 1046 } 1047 } 1048 1049 // remove untranslated regions from exons 1050 int nExons = exonStarts.size(); 1051 exonStarts.remove(0); 1052 exonStarts.add(0, cdsStart); 1053 exonEnds.remove(nExons-1); 1054 exonEnds.add(cdsEnd); 1055 1056 int codingLength = 0; 1057 int lengthExon = 0; 1058 1059 // map the genetic coordinate on a stretch of a reverse strand 1060 for (int i = nExons - 1; i >= 0; i--) { 1061 1062 int start = exonStarts.get(i); 1063 int end = exonEnds.get(i); 1064 1065 lengthExon = end - start; 1066 // +1 offset to be a base 1 1067 if (start+1 <= chromPos && end >= chromPos ) { 1068 return codingLength + (end-chromPos+1); 1069 } 1070 else { 1071 codingLength += lengthExon; 1072 } 1073 } 1074 return -1; 1075 } 1076}