001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.util; 022 023import com.google.common.collect.Range; 024import org.biojava.nbio.core.sequence.DNASequence; 025import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 026import org.biojava.nbio.core.sequence.template.SequenceView; 027import org.biojava.nbio.genome.parsers.genename.ChromPos; 028import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition; 029import org.biojava.nbio.genome.parsers.twobit.TwoBitFacade; 030import org.slf4j.Logger; 031import org.slf4j.LoggerFactory; 032 033import java.io.StringWriter; 034import java.util.ArrayList; 035import java.util.List; 036 037/** 038 * A class that can map chromosomal positions to mRNA (coding sequence) positions. 039 * 040 * @author Andreas Prlic 041 */ 042 043public class ChromosomeMappingTools { 044 045 private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class); 046 047 private static final String newline = System.getProperty("line.separator"); 048 049 public static final String CHROMOSOME = "CHROMOSOME"; 050 public static final String CDS = "CDS"; 051 052 private static int base = 1; 053 public static void setCoordinateSystem(int baseInt) { 054 base = baseInt; 055 } 056 057 /** 058 * Pretty print the details of a GeneChromosomePosition to a String 059 * 060 * @param chromosomePosition 061 * @return 062 */ 063 public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){ 064 if ( chromosomePosition.getOrientation() == '+') 065 return formatExonStructureForward(chromosomePosition); 066 return formatExonStructureReverse(chromosomePosition); 067 } 068 069 private static String formatExonStructureForward(GeneChromosomePosition chromPos) { 070 071 StringWriter s = new StringWriter(); 072 073 List<Integer> exonStarts = chromPos.getExonStarts(); 074 List<Integer> exonEnds = chromPos.getExonEnds(); 075 076 int cdsStart = chromPos.getCdsStart(); 077 int cdsEnd = chromPos.getCdsEnd(); 078 079 boolean inCoding = false; 080 int codingLength = 0; 081 082 for (int i = 0; i < exonStarts.size(); i++) { 083 084 int start = exonStarts.get(i); 085 int end = exonEnds.get(i); 086 087 if (start <= cdsStart +1 && end >= cdsStart+1) { 088 089 inCoding = true; 090 codingLength += (end - cdsStart); 091 s.append(" UTR : ").append(format(start)).append(" - ").append(format(cdsStart)); 092 s.append(newline); 093 s.append(" -> Exon : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 094 s.append(newline); 095 096 } else if (start <= cdsEnd && end >= cdsEnd) { 097 //logger.debug(" <-- CDS end at: " + cdsEnd ); 098 inCoding = false; 099 codingLength += (cdsEnd - start); 100 101 s.append(" <- Exon : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 102 s.append(newline); 103 s.append(" UTR : " + (cdsEnd +1) + " - " + format(end)); 104 s.append(newline); 105 106 } else if (inCoding) { 107 // full exon is coding 108 codingLength += (end - start); 109 110 s.append(" Exon : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 111 s.append(newline); 112 } 113 } 114 s.append("Coding Length: "); 115 s.append((codingLength-3)+""); 116 s.append(newline); 117 return s.toString(); 118 } 119 120 private static String formatExonStructureReverse(GeneChromosomePosition chromPos) { 121 StringWriter s = new StringWriter(); 122 123 List<Integer> exonStarts = chromPos.getExonStarts(); 124 List<Integer> exonEnds = chromPos.getExonEnds(); 125 126 127 int cdsStart = chromPos.getCdsStart(); 128 int cdsEnd = chromPos.getCdsEnd(); 129 130 // logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd)); 131 132 boolean inCoding = false; 133 int codingLength = 0; 134 135 if (cdsEnd < cdsStart) { 136 int tmp = cdsEnd; 137 cdsEnd = cdsStart; 138 cdsStart = tmp; 139 } 140 141 // map reverse 142 for (int i = exonStarts.size() - 1; i >= 0; i--) { 143 144 int end = exonStarts.get(i); 145 int start = exonEnds.get(i); 146 147 if (end < start) { 148 int tmp = end; 149 end = start; 150 start = tmp; 151 } 152 153 if (start <= cdsEnd && end >= cdsEnd) { 154 inCoding = true; 155 156 int tmpstart = start; 157 if (start < cdsStart) { 158 tmpstart = cdsStart; 159 } 160 codingLength += (cdsEnd - tmpstart); 161 162 s.append(" UTR :" + format(cdsEnd + 1) + " | " + format(end)); 163 s.append(newline); 164 if (tmpstart == start) 165 s.append(" -> "); 166 else 167 s.append(" <-> "); 168 s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); 169 s.append(newline); 170 // single exon with UTR on both ends 171 if (tmpstart != start) 172 s.append(" UTR :" + format(cdsStart ) + " - " + format(start + 1)); 173 s.append(newline); 174 175 } else if (start <= cdsStart && end >= cdsStart) { 176 inCoding = false; 177 codingLength += (end - cdsStart); 178 179 s.append(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); 180 s.append(newline); 181 s.append(" UTR : " + format(start+1) + " - " + format(cdsStart )); 182 s.append(newline); 183 184 185 } else if (inCoding) { 186 // full exon is coding 187 codingLength += (end - start); 188 189 s.append(" Exon : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); 190 s.append(newline); 191 } else { 192 // e.g. see UBQLN3 193 s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end)); 194 s.append(newline); 195 } 196 } 197 198 s.append("CDS length: ").append(Integer.toString(codingLength - 3)); 199 s.append(newline); 200 201 return s.toString(); 202 } 203 204 /** 205 * Get the length of the CDS in nucleotides. 206 * 207 * @param chromPos 208 * @return length of the CDS in nucleotides. 209 */ 210 public static int getCDSLength(GeneChromosomePosition chromPos) { 211 212 List<Integer> exonStarts = chromPos.getExonStarts(); 213 List<Integer> exonEnds = chromPos.getExonEnds(); 214 215 int cdsStart = chromPos.getCdsStart(); 216 int cdsEnd = chromPos.getCdsEnd(); 217 218 int codingLength; 219 if (chromPos.getOrientation().equals('+')) 220 codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd); 221 else 222 codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd); 223 return codingLength; 224 } 225 226 /** 227 * Maps the position of a CDS nucleotide back to the genome 228 * 229 * @param cdsNucleotidePosition 230 * @return a ChromPos object 231 */ 232 public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) { 233 234 logger.debug(" ? Checking chromosome position for CDS position {}", cdsNucleotidePosition); 235 236 List<Integer> exonStarts = chromPos.getExonStarts(); 237 List<Integer> exonEnds = chromPos.getExonEnds(); 238 239 logger.debug(" Exons:{}", exonStarts.size()); 240 241 int cdsStart = chromPos.getCdsStart(); 242 int cdsEnd = chromPos.getCdsEnd(); 243 244 245 ChromPos chromosomePos = null; 246 247 if (chromPos.getOrientation().equals('+')) 248 249 chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd); 250 else 251 chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd); 252 253 logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at " + chromosomePos); 254 return chromosomePos; 255 256 } 257 258 /** 259 * Returns a nicely formatted representation of the position 260 * 261 * @param chromosomePosition 262 * @return 263 */ 264 private static String format(int chromosomePosition){ 265 return String.format("%,d", chromosomePosition); 266 } 267 268 /** 269 * Get the CDS position mapped on the chromosome position 270 * 271 * @param exonStarts 272 * @param exonEnds 273 * @param cdsStart 274 * @param cdsEnd 275 * @return 276 */ 277 public static ChromPos getChromPosReverse(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) { 278 279 boolean inCoding = false; 280 int codingLength = 0; 281 282 if (cdsEnd < cdsStart) { 283 int tmp = cdsEnd; 284 cdsEnd = cdsStart; 285 cdsStart = tmp; 286 } 287 288 int lengthExons = 0; 289 290 // map reverse 291 for (int i = exonStarts.size() - 1; i >= 0; i--) { 292 293 logger.debug("Exon #" + (i+1) + "/" + exonStarts.size()); 294 int end = exonStarts.get(i); 295 int start = exonEnds.get(i); 296 297 if (end < start) { 298 int tmp = end; 299 end = start; 300 start = tmp; 301 } 302 lengthExons += end - start; 303 304 logger.debug(" is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1)); 305 logger.debug(" CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength); 306 307 if (start+1 <= cdsEnd && end >= cdsEnd) { 308 309 // FIRST EXON 310 inCoding = true; 311 312 int tmpstart = start; 313 if (start < cdsStart) { 314 tmpstart = cdsStart; 315 } 316 317 // here one of the few places where we don't say start+1 318 int check = codingLength + cdsEnd - tmpstart ; 319 320 logger.debug("First Exon | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos ); 321 322 323 if ( ( check > cdsPos) ) { 324 int tmp = cdsPos - codingLength ; 325 logger.debug(" -> found position in UTR exon: " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength); 326 return new ChromPos((cdsEnd - tmp), -1) ; 327 } 328 329 // don't add 1 here 330 codingLength += (cdsEnd - tmpstart ); 331 332 boolean debug = logger.isDebugEnabled(); 333 334 if ( debug ) { 335 336 StringBuffer b = new StringBuffer(); 337 338 b.append(" UTR :" + format(cdsEnd + 1) + " - " + format(end) + newline); 339 if (tmpstart == start) 340 b.append(" -> "); 341 else 342 b.append(" <-> "); 343 b.append("Exon :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline); 344 345 // single exon with UTR on both ends 346 if (tmpstart != start) 347 b.append(" UTR :" + format(cdsStart) + " - " + format(start + 1) + newline); 348 349 logger.debug(b.toString()); 350 } 351 } else if (start <= cdsStart && end >= cdsStart) { 352 353 // LAST EXON 354 inCoding = false; 355 356 if (codingLength + end - cdsStart >= cdsPos) { 357 358 // how many remaining coding nucleotides? 359 int tmp = codingLength + end - cdsStart - cdsPos ; 360 361 logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart)); 362 363 logger.debug(" -> XXX found position noncoding exon: cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd)); 364 365 return new ChromPos((cdsStart + tmp),-1); 366 } 367 368 codingLength += (end - cdsStart); 369 370 logger.debug(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3)); 371 logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart )); 372 373 } else if (inCoding) { 374 375 if (codingLength + end - start -1 >= cdsPos) { 376 377 int tmp = cdsPos - codingLength ; 378 379 if ( tmp > (end - start ) ) { 380 tmp = (end - start ); 381 logger.debug("changing tmp to {}", tmp); 382 } 383 logger.debug(" " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp); 384 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); 385 logger.debug(" -> RRR found position coding exon: " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength); 386 387 return new ChromPos((end - tmp),cdsPos %3); 388 } 389 // full exon is coding 390 codingLength += (end - start) ; 391 392 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3)); 393 } else { 394 // e.g. see UBQLN3 395 logger.debug(" no translation!"); 396 } 397 logger.debug(" coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos); 398 } 399 400 logger.debug("length exons: {}", lengthExons); 401 // could not map, or map over the full length?? 402 return new ChromPos(-1,-1); 403 404 } 405 406 /** 407 * Get the CDS position mapped onto the chromosome position 408 * 409 * @param exonStarts 410 * @param exonEnds 411 * @param cdsStart 412 * @param cdsEnd 413 * @return 414 */ 415 public static ChromPos getChromPosForward(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) { 416 boolean inCoding = false; 417 int codingLength = 0; 418 419 @SuppressWarnings("unused") 420 int lengthExons = 0; 421 // map forward 422 for (int i = 0; i < exonStarts.size(); i++) { 423 424 // start can include UTR 425 int start = exonStarts.get(i); 426 int end = exonEnds.get(i); 427 428 lengthExons += end - start; 429 430 if (start <= cdsStart +1 && end >= cdsStart+1) { 431 // first exon with UTR 432 if (codingLength + (end - cdsStart-1) >= cdsPos) { 433 // we are reaching our target position 434 int tmp = cdsPos - codingLength; 435 436 437 logger.debug(cdsStart + " | " + codingLength + " | " + tmp); 438 logger.debug(" -> found position in UTR exon: #"+(i+1)+ " cdsPos:" + cdsPos + 439 " return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength); 440 441 // we start 1 after cdsStart... 442 return new ChromPos((cdsStart +1 + tmp),-1); 443 } 444 inCoding = true; 445 codingLength += (end - cdsStart); 446 447 logger.debug(" UTR : " + format(start+1) + " - " + (cdsStart )); 448 logger.debug(" -> Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); 449 450 } else if (start+1 <= cdsEnd && end >= cdsEnd) { 451 // LAST EXON with UTR 452 //logger.debug(" <-- CDS end at: " + cdsEnd ); 453 inCoding = false; 454 if (codingLength + (cdsEnd - start-1) >= cdsPos) { 455 int tmp = cdsPos - codingLength; 456 457 logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3)); 458 logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); 459 logger.debug( codingLength + " | " + tmp + " | " + format(start+1)); 460 logger.debug(" -> chromPosForward found position in non coding exon: " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength); 461 462 return new ChromPos((start +1 + tmp),cdsPos%3); 463 } 464 codingLength += (cdsEnd - start-1); 465 466 logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3)); 467 logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); 468 469 470 } else if (inCoding) { 471 // A standard coding Exon 472 // tests for the maximum length of this coding exon 473 if (codingLength + (end - start -1) >= cdsPos) { 474 475 // we are within the range of this exon 476 int tmp = cdsPos - codingLength ; 477 478 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength); 479 logger.debug(" -> found chr position in coding exon #" + (i+1) + ": cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength); 480 481 return new ChromPos((start +1 + tmp),cdsPos%3); 482 } 483 // full exon is coding 484 codingLength += (end - start ); 485 486 logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); 487 } 488 } 489 return new ChromPos(-1,-1); 490 } 491 492 /** 493 * Get the length of the coding sequence 494 * 495 * @param exonStarts 496 * @param exonEnds 497 * @param cdsStart 498 * @param cdsEnd 499 * @return 500 */ 501 public static int getCDSLengthReverse(List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) { 502 503 int codingLength = 0; 504 505 if (cdsEnd < cdsStart) { 506 int tmp = cdsEnd; 507 cdsEnd = cdsStart; 508 cdsStart = tmp; 509 } 510 cdsStart = cdsStart + base; 511 512 // map reverse 513 for (int i = exonStarts.size() - 1; i >= 0; i--) { 514 515 int end = exonStarts.get(i); 516 int start = exonEnds.get(i); 517 518 if (end < start) { 519 int tmp = end; 520 end = start; 521 start = tmp; 522 } 523 start = start + base; 524 525 if ((start < cdsStart && end < cdsStart) || (start > cdsEnd && end > cdsEnd)) 526 continue; 527 528 if (start < cdsStart) 529 start = cdsStart; 530 531 if (end > cdsEnd) 532 end = cdsEnd; 533 534 codingLength += (end - start + 1); 535 } 536 return codingLength - 3; 537 } 538 539 /** 540 * Get the length of the coding sequence 541 * 542 * @param exonStarts 543 * @param exonEnds 544 * @param cdsStart 545 * @param cdsEnd 546 * @return 547 */ 548 public static int getCDSLengthForward(List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) { 549 550 int codingLength = 0; 551 552 for (int i = 0; i < exonStarts.size(); i++) { 553 554 int start = exonStarts.get(i)+base; 555 int end = exonEnds.get(i); 556 557 if ( (start < cdsStart+base && end < cdsStart) || (start > cdsEnd && end > cdsEnd) ) 558 continue; 559 560 if (start < cdsStart+base) 561 start = cdsStart+base; 562 563 if (end > cdsEnd) 564 end = cdsEnd; 565 566 codingLength += (end - start + 1); 567 } 568 return codingLength-3 ; 569 } 570 571 /** 572 * Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions) 573 * 574 * @param chromPos 575 * @return 576 */ 577 public static List<Range<Integer>> getCDSExonRanges(GeneChromosomePosition chromPos){ 578 if ( chromPos.getOrientation() == '+') 579 return getCDSExonRangesForward(chromPos,CDS); 580 return getCDSExonRangesReverse(chromPos,CDS); 581 } 582 583 /** Extracts the boundaries of the coding regions in chromosomal coordinates 584 * 585 * @param chromPos 586 * @return 587 */ 588 public static List<Range<Integer>> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){ 589 if ( chromPos.getOrientation() == '+') 590 return getCDSExonRangesForward(chromPos,CHROMOSOME); 591 return getCDSExonRangesReverse(chromPos,CHROMOSOME); 592 } 593 594 private static List<Range<Integer>> getCDSExonRangesReverse(GeneChromosomePosition chromPos, String responseType) { 595 596 List<Integer> exonStarts = chromPos.getExonStarts(); 597 List<Integer> exonEnds = chromPos.getExonEnds(); 598 599 List<Range<Integer>> data = new ArrayList<>(); 600 int cdsStart = chromPos.getCdsStart(); 601 int cdsEnd = chromPos.getCdsEnd(); 602 603 boolean inCoding = false; 604 int codingLength = 0; 605 606 if (cdsEnd < cdsStart) { 607 int tmp = cdsEnd; 608 cdsEnd = cdsStart; 609 cdsStart = tmp; 610 } 611 612 java.lang.StringBuffer s =null; 613 614 boolean debug = logger.isDebugEnabled(); 615 616 if ( debug) 617 s = new StringBuffer(); 618 619 //int lengthExons = 0; 620 621 // map reverse 622 for (int i = exonStarts.size() - 1; i >= 0; i--) { 623 624 int end = exonStarts.get(i); 625 int start = exonEnds.get(i); 626 627 if (end < start) { 628 int tmp = end; 629 end = start; 630 start = tmp; 631 } 632 //lengthExons += end - start; 633 //s.append("Reverse exon: " + end + " - " + start + " | " + (end - start)); 634 //s.append(newline); 635 636 if (start <= cdsEnd && end >= cdsEnd) { 637 inCoding = true; 638 639 int tmpstart = start; 640 if (start < cdsStart) { 641 tmpstart = cdsStart; 642 } 643 codingLength += (cdsEnd - tmpstart); 644 if ( debug ) { 645 s.append(" UTR :").append(format(cdsEnd + 1)).append(" | ").append(format(end)); 646 s.append(newline); 647 if (tmpstart == start) 648 s.append(" -> "); 649 else 650 s.append(" <-> "); 651 s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3); 652 s.append(newline); 653 // single exon with UTR on both ends 654 if (tmpstart != start) 655 s.append(" UTR :").append(format(cdsStart)).append(" - ").append(format(start + 1)); 656 s.append(newline); 657 } 658 659 660 Range<Integer> r ; 661 if ( responseType.equals(CDS)) 662 r = Range.closed(0,codingLength); 663 else 664 r = Range.closed(tmpstart,cdsEnd); 665 666 data.add(r); 667 668 } else if (start <= cdsStart && end >= cdsStart) { 669 inCoding = false; 670 671 Range<Integer> r; 672 if ( responseType.equals(CDS)) 673 r = Range.closed(codingLength,codingLength+(end-cdsStart)); 674 else 675 r = Range.closed(cdsStart+1,end); 676 677 data.add(r); 678 679 codingLength += (end - cdsStart); 680 if (debug) { 681 s.append(" <- Exon : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); 682 s.append(newline); 683 s.append(" UTR : ").append(format(start + 1)).append(" - ").append(format(cdsStart)); 684 s.append(newline); 685 } 686 } else if (inCoding) { 687 // full exon is coding 688 Range<Integer> r; 689 if ( responseType.equals(CDS)) 690 r = Range.closed(codingLength,codingLength+(end-start)); 691 else 692 r = Range.closed(start,end); 693 data.add(r); 694 695 codingLength += (end - start); 696 if (debug) { 697 s.append(" Exon : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); 698 s.append(newline); 699 } 700 } else { 701 // e.g. see UBQLN3 702 if ( debug ) { 703 s.append(" no translation! UTR: " + format(start) + " - " + format(end)); 704 s.append(newline); 705 } 706 } 707 } 708 if ( debug ) { 709 s.append("CDS length: ").append(Integer.toString(codingLength - 3)); 710 s.append(newline); 711 logger.debug(s.toString()); 712 } 713 714 return data; 715 } 716 717 private static List<Range<Integer>> getCDSExonRangesForward(GeneChromosomePosition chromPos, String responseType) { 718 719 List<Range<Integer>> data = new ArrayList<>(); 720 List<Integer> exonStarts = chromPos.getExonStarts(); 721 List<Integer> exonEnds = chromPos.getExonEnds(); 722 723 int cdsStart = chromPos.getCdsStart(); 724 int cdsEnd = chromPos.getCdsEnd(); 725 726 boolean inCoding = false; 727 int codingLength = 0; 728 729 for (int i = 0; i < exonStarts.size(); i++) { 730 731 int start = exonStarts.get(i); 732 int end = exonEnds.get(i); 733 734 if (start <= cdsStart && end >= cdsStart) { 735 736 inCoding = true; 737 codingLength += (end - cdsStart); 738 739 Range<Integer> r; 740 if ( responseType.equals(CDS)) 741 r = Range.closed(0,codingLength); 742 else 743 r = Range.closed(cdsStart,end); 744 data.add(r); 745 746 } else if (start <= cdsEnd && end >= cdsEnd) { 747 //logger.debug(" <-- CDS end at: " + cdsEnd ); 748 inCoding = false; 749 750 Range<Integer> r; 751 if ( responseType.equals(CDS)) 752 r = Range.closed(codingLength,codingLength+(cdsEnd-start)); 753 else 754 r = Range.closed(start,cdsEnd); 755 data.add(r); 756 codingLength += (cdsEnd - start); 757 758 } else if (inCoding) { 759 // full exon is coding 760 Range<Integer> r; 761 if ( responseType.equals(CDS)) 762 r = Range.closed(codingLength,codingLength+(end-start)); 763 else 764 r = Range.closed(start,end); 765 data.add(r); 766 codingLength += (end - start); 767 } 768 } 769 return data; 770 } 771 772 /** 773 * I have a genomic coordinate, where is it on the mRNA 774 * 775 * @param coordinate 776 * @param chromosomePosition 777 * @return 778 */ 779 public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) { 780 781 if ( chromosomePosition.getOrientation() == '+') 782 return getCDSPosForward(coordinate, 783 chromosomePosition.getExonStarts(), 784 chromosomePosition.getExonEnds(), 785 chromosomePosition.getCdsStart(), 786 chromosomePosition.getCdsEnd()); 787 788 return getCDSPosReverse(coordinate, 789 chromosomePosition.getExonStarts(), 790 chromosomePosition.getExonEnds(), 791 chromosomePosition.getCdsStart(), 792 chromosomePosition.getCdsEnd()); 793 } 794 795 /** 796 * Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene 797 * living on the forward DNA strand. 798 * 799 * @param chromPos The genetic coordinate on a chromosome 800 * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) 801 * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) 802 * @param cdsStart The start position of a coding region 803 * @param cdsEnd The end position of a coding region 804 * 805 * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1) 806 * 807 * @author Yana Valasatava 808 */ 809 public static int getCDSPosForward(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds, 810 int cdsStart, int cdsEnd) { 811 812 // the genetic coordinate is not in a coding region 813 if ( (chromPos < (cdsStart+base) ) || ( chromPos > (cdsEnd+base) ) ) { 814 logger.debug("The {} position is not in a coding region", format(chromPos)); 815 return -1; 816 } 817 818 logger.debug("looking for CDS position for {}", format(chromPos)); 819 820 // map the genetic coordinates of coding region on a stretch of a reverse strand 821 List<Range<Integer>> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd); 822 823 int codingLength = 0; 824 int lengthExon = 0; 825 for (Range<Integer> range : cdsRegions) { 826 827 int start = range.lowerEndpoint(); 828 int end = range.upperEndpoint(); 829 830 lengthExon = end - start; 831 832 if (start+base <= chromPos && end >= chromPos ) { 833 return codingLength + (chromPos-start); 834 } 835 else { 836 codingLength += lengthExon; 837 } 838 } 839 return -1; 840 } 841 842 /** 843 * Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene 844 * living on the reverse DNA strand. 845 * 846 * @param chromPos The genetic coordinate on a chromosome 847 * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) 848 * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) 849 * @param cdsStart The start position of a coding region 850 * @param cdsEnd The end position of a coding region 851 * 852 * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1) 853 * 854 * @author Yana Valasatava 855 */ 856 public static int getCDSPosReverse(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds, 857 int cdsStart, int cdsEnd) { 858 859 // the genetic coordinate is not in a coding region 860 if ( (chromPos < (cdsStart+base)) || ( chromPos > (cdsEnd+base) ) ) { 861 logger.debug("The {} position is not in a coding region", format(chromPos)); 862 return -1; 863 } 864 865 logger.debug("looking for CDS position for {}", format(chromPos)); 866 867 // map the genetic coordinate on a stretch of a reverse strand 868 List<Range<Integer>> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd); 869 870 int codingLength = 0; 871 int lengthExon = 0; 872 for ( int i=cdsRegions.size()-1; i>=0; i-- ) { 873 874 int start = cdsRegions.get(i).lowerEndpoint(); 875 int end = cdsRegions.get(i).upperEndpoint(); 876 877 lengthExon = end - start; 878 // +1 offset to be a base 1 879 if (start+base <= chromPos && end >= chromPos ) { 880 return codingLength + (end-chromPos+1); 881 } 882 else { 883 codingLength += lengthExon; 884 } 885 } 886 return -1; 887 } 888 889 /** 890 * Extracts the exons boundaries in CDS coordinates corresponding to the forward DNA strand. 891 * 892 * @param origExonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) 893 * @param origExonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) 894 * @param cdsStart The start position of a coding region 895 * @param cdsEnd The end position of a coding region 896 * 897 * @return the list of genetic positions corresponding to the exons boundaries in CDS coordinates 898 */ 899 public static List<Range<Integer>> getCDSRegions(List<Integer> origExonStarts, List<Integer> origExonEnds, int cdsStart, int cdsEnd) { 900 901 // remove exons that are fully landed in UTRs 902 List<Integer> exonStarts = new ArrayList<>(origExonStarts); 903 List<Integer> exonEnds = new ArrayList<>(origExonEnds); 904 905 int j=0; 906 for (int i = 0; i < origExonStarts.size(); i++) { 907 if ( ( origExonEnds.get(i) < cdsStart) || ( origExonStarts.get(i) > cdsEnd) ) { 908 exonStarts.remove(j); 909 exonEnds.remove(j); 910 } 911 else { 912 j++; 913 } 914 } 915 916 // remove untranslated regions from exons 917 int nExons = exonStarts.size(); 918 exonStarts.remove(0); 919 exonStarts.add(0, cdsStart); 920 exonEnds.remove(nExons-1); 921 exonEnds.add(cdsEnd); 922 923 List<Range<Integer>> cdsRegion = new ArrayList<>(); 924 for ( int i=0; i<nExons; i++ ) { 925 Range<Integer> r = Range.closed(exonStarts.get(i), exonEnds.get(i)); 926 cdsRegion.add(r); 927 } 928 return cdsRegion; 929 } 930 931 /** 932 * Extracts the DNA sequence transcribed from the input genetic coordinates. 933 * 934 * @param twoBitFacade the facade that provide an access to a 2bit file 935 * @param gcp The container with chromosomal positions 936 * 937 * @return the DNA sequence transcribed from the input genetic coordinates 938 */ 939 public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, GeneChromosomePosition gcp) throws Exception { 940 return getTranscriptDNASequence(twoBitFacade,gcp.getChromosome(),gcp.getExonStarts(), gcp.getExonEnds(), gcp.getCdsStart(), gcp.getCdsEnd(), gcp.getOrientation()); 941 } 942 943 /** 944 * Extracts the DNA sequence transcribed from the input genetic coordinates. 945 * 946 * @param chromosome the name of the chromosome 947 * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) 948 * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) 949 * @param cdsStart The start position of a coding region 950 * @param cdsEnd The end position of a coding region 951 * @param orientation The orientation of the strand where the gene is living 952 * 953 * @return the DNA sequence transcribed from the input genetic coordinates 954 */ 955 public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, String chromosome, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd, Character orientation) throws Exception { 956 957 List<Range<Integer>> cdsRegion = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd); 958 959 StringBuilder dnaSequence = new StringBuilder(); 960 for (Range<Integer> range : cdsRegion) { 961 String exonSequence = twoBitFacade.getSequence(chromosome,range.lowerEndpoint(), range.upperEndpoint()); 962 dnaSequence.append(exonSequence); 963 } 964 if (orientation.equals('-')) { 965 dnaSequence = new StringBuilder(new StringBuilder(dnaSequence.toString()).reverse().toString()); 966 DNASequence dna = new DNASequence(dnaSequence.toString()); 967 SequenceView<NucleotideCompound> compliment = dna.getComplement(); 968 dnaSequence = new StringBuilder(compliment.getSequenceAsString()); 969 } 970 return new DNASequence(dnaSequence.toString().toUpperCase()); 971 } 972}