001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.util; 022 023import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 024import org.biojava.nbio.core.sequence.DNASequence; 025import org.biojava.nbio.core.sequence.ProteinSequence; 026import org.biojava.nbio.core.sequence.template.Sequence; 027 028public class SequenceTools { 029 030 protected static final String NUCLEOTIDE_LETTERS = "GCTAUXN"; 031 032 /** 033 * Cyclically permute the characters in {@code string} <em>forward</em> by {@code n} elements. 034 * @param string The string to permute 035 * @param n The number of characters to permute by; can be positive or negative; values greater than the length of the array are acceptable 036 */ 037 public static String permuteCyclic(String string, int n) { 038 // single letters are char[]; full names are Character[] 039 Character[] permuted = new Character[string.length()]; 040 char[] c = string.toCharArray(); 041 Character[] charArray = new Character[c.length]; 042 for (int i = 0; i < c.length; i++) { 043 charArray[i] = c[i]; 044 } 045 permuteCyclic(charArray, permuted, n); 046 char[] p = new char[permuted.length]; 047 for (int i = 0; i < p.length; i++) { 048 p[i] = permuted[i]; 049 } 050 return String.valueOf(p); 051 } 052 053 /** 054 * Improved implementation that is generally 10-100x faster, and fixes some edge-case bugs. 055 * @param string The string to permute 056 * @param n The number of characters to permute by; can be positive or negative; values greater than the length of the array are acceptable 057 * @return 058 */ 059 public static String permuteCyclic2(String string, int n) { 060 String toMutate = string + string; 061 n = n % string.length(); 062 if (n < 0) { 063 n = string.length() + n; 064 } 065 return toMutate.substring(n, n + string.length()); 066 } 067 068 /** 069 * Cyclically permute {@code array} <em>forward</em> by {@code n} elements. 070 * @param array The original result; will not be changed 071 * @param fill The permuted result will be filled into this array 072 * @param n The number of elements to permute by; can be positive or negative; values greater than the length of the array are acceptable 073 */ 074 public static <T> void permuteCyclic(T[] array, T[] fill, int n) { 075 if (array.length != fill.length) throw new IllegalArgumentException("Lengths do not match"); 076 if (n < 0) n = array.length + n; 077 while (n > array.length) { 078 n -= array.length; 079 } 080 for (int i = 0; i < array.length; i++) { 081 if (i + n < array.length) { 082 fill[i] = array[i + n]; 083 } else { 084 fill[i] = array[i - array.length + n]; 085 } 086 } 087 } 088 089 public static int percentNucleotideSequence(String sequence) 090 { 091 if (sequence == null || sequence.length() == 0) return 0; 092 093 int l = sequence.length(); 094 int n =0; 095 096 for (int i = 0; i < l; i++) 097 { 098 if (NUCLEOTIDE_LETTERS.indexOf(sequence.charAt(i)) < 0) 099 { 100 continue; 101 } 102 n++; 103 } 104 return (100 * n) / l; 105 } 106 107 public static boolean isNucleotideSequence(String sequence) 108 { 109 if (sequence == null || sequence.length() == 0) return false; 110 111 int l = sequence.length(); 112 for (int i = 0; i < l; i++) 113 { 114 if (NUCLEOTIDE_LETTERS.indexOf(sequence.charAt(i)) < 0) 115 { 116 return false; 117 } 118 } 119 return true; 120 } 121 122 /** 123 * Attempts to parse String as a DNA sequence first.<br/> 124 * If this fails it tries to parse as a ProteinSequence. 125 * <br/> 126 * This method does not attempt to create an RNASequence. 127 * <p> 128 * Also, a sequence such as 'ATCGTA' which is both a 129 * peptide sequence and a DNA sequence, will always be returned 130 * as a DNA sequence. 131 * </p> 132 * <p> 133 * An empty string argument returns a ProteinSequence of length 0. 134 * A null argument throws a {@link NullPointerException} 135 * @param sequence 136 * @return Either a DNASequence or a ProteinSequence 137 * @throws CompoundNotFoundException 138 */ 139 public Sequence<?> getSequenceFromString(String sequence) throws CompoundNotFoundException { 140 141 142 if( isNucleotideSequence(sequence)) { 143 return new DNASequence(sequence); 144 } else { 145 return new ProteinSequence(sequence); 146 147 } 148 149 } 150 151 /** A method to check whether an array of sequences contains at least two sequences having an equal length. 152 * 153 * @param sequences the array of {@link org.biojava.nbio.core.sequence.ProteinSequence} sequences 154 * @return true if any two sequences are of an equal length 155 */ 156 public static boolean equalLengthSequences(ProteinSequence[] sequences) { 157 158 for (int i=0; i<sequences.length-1; i++) { 159 if (sequences[i]==null) 160 continue; 161 for (int j=i+1; j<sequences.length; j++) { 162 if (sequences[j]==null) 163 continue; 164 if (sequences[i].getLength() == sequences[j].getLength()) 165 return true; 166 } 167 } 168 return false; 169 } 170}