/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.aaproperties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/**
 * Static helper methods shared by the amino-acid property calculations:
 * decimal rounding and validation/cleaning of protein sequences against a set
 * of accepted characters.
 *
 * @author kohchuanhock
 * @version 2011.08.22
 * @since 3.0.2
 */
public class Utils {

	private final static Logger logger = LoggerFactory.getLogger(Utils.class);

	/**
	 * Returns a value rounded to the desired number of decimal places.
	 *
	 * @param d
	 * 	value to round
	 * @param c
	 * 	number of decimal places desired.
	 * 	Must be greater or equal to zero, otherwise the given value d is returned without any modification.
	 * @return
	 * 	the value rounded to the given number of decimal places. If d is NaN or infinite, or if
	 * 	d scaled by 10^c does not fit into a long, d is returned unchanged.
	 */
	public static final double roundToDecimals(double d, int c) {
		if (c < 0) {
			return d;
		}
		double factor = Math.pow(10, c);
		double scaled = d * factor;
		// Math.round maps NaN to 0 and clamps values outside the long range, which would
		// silently turn NaN/Infinity/very large inputs into unrelated numbers. A double that
		// large carries no digits below the requested precision, so d itself is the answer.
		if (Double.isNaN(scaled) || Math.abs(scaled) >= Long.MAX_VALUE) {
			return d;
		}
		return Math.round(scaled) / factor;
	}

	/**
	 * Checks if the given sequence contains invalid characters.
	 * A character is deemed valid only if it is found in cSet.
	 *
	 * @param sequence
	 * 	protein sequence to be checked. Must not be null.
	 * @param cSet
	 * 	the set of characters that are deemed valid. Can be null, in which case
	 * 	{@code PeptideProperties.standardAASet} is used.
	 * @return
	 * 	true if at least one invalid character is found, else false.
	 */
	public static final boolean doesSequenceContainInvalidChar(String sequence, Set<Character> cSet) {
		Set<Character> validChars = resolveValidChars(cSet);
		for (char c : sequence.toCharArray()) {
			if (!validChars.contains(c)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Returns the number of invalid characters in the sequence.
	 *
	 * @param sequence
	 * 	protein sequence to count invalid characters in. Must not be null.
	 * @param cSet
	 * 	the set of characters that are deemed valid. Can be null, in which case
	 * 	{@code PeptideProperties.standardAASet} is used.
	 * @param ignoreCase
	 * 	if true, the sequence is converted to upper case before it is compared against cSet
	 * 	(so this is only useful when cSet holds upper-case characters).
	 * @return
	 * 	the number of invalid characters in the sequence.
	 */
	public static final int getNumberOfInvalidChar(String sequence, Set<Character> cSet, boolean ignoreCase) {
		// Locale.ROOT keeps the case mapping independent of the JVM default locale
		// (e.g. a Turkish locale would otherwise map 'i' to a dotted capital I).
		String examined = ignoreCase ? sequence.toUpperCase(Locale.ROOT) : sequence;
		Set<Character> validChars = resolveValidChars(cSet);
		int total = 0;
		for (char c : examined.toCharArray()) {
			if (!validChars.contains(c)) {
				total++;
			}
		}
		return total;
	}

	/**
	 * Returns a new sequence with all invalid characters replaced by '-'.
	 * When at least one character was replaced, a warning listing the replaced characters is logged.
	 *
	 * @param sequence
	 * 	protein sequence to be cleaned. Must not be null.
	 * @param cSet
	 * 	user defined characters that are valid. Can be null, in which case
	 * 	{@code PeptideProperties.standardAASet} is used.
	 * @return
	 * 	a new sequence with all invalid characters replaced by '-'.
	 */
	public static final String cleanSequence(String sequence, Set<Character> cSet) {
		Set<Character> validChars = resolveValidChars(cSet);
		Set<Character> invalidCharSet = new HashSet<Character>();
		StringBuilder cleanSeq = new StringBuilder(sequence.length());
		for (char c : sequence.toCharArray()) {
			if (validChars.contains(c)) {
				cleanSeq.append(c);
			} else {
				cleanSeq.append('-');
				invalidCharSet.add(c);
			}
		}

		// Only report when something was actually replaced; building the message for an
		// empty set used to fail with a StringIndexOutOfBoundsException.
		if (!invalidCharSet.isEmpty()) {
			StringBuilder replaced = new StringBuilder();
			for (char c : invalidCharSet) {
				if (replaced.length() > 0) {
					replaced.append(", ");
				}
				replaced.append('\'').append(c).append('\'');
			}
			replaced.append(" are being replaced with '-'");
			logger.warn(replaced.toString());
		}

		return cleanSeq.toString();
	}

	/**
	 * Checks if the sequence contains characters that are not in
	 * {@code PeptideProperties.standardAASet}.
	 * If yes, it returns a new sequence where invalid characters are replaced with '-'.
	 * If no, it simply returns the input sequence.
	 *
	 * @param sequence
	 * 	protein sequence to be checked for invalid characters. A null sequence is returned as is.
	 * @return
	 * 	a sequence with no invalid characters.
	 */
	public static final String checkSequence(String sequence) {
		return checkSequence(sequence, null);
	}

	/**
	 * Checks if the sequence contains invalid characters, i.e. characters not found in cSet.
	 * If yes, it logs warnings and returns a new sequence where invalid characters are replaced with '-'.
	 * If no, it simply returns the input sequence.
	 *
	 * @param sequence
	 * 	protein sequence to be checked for invalid characters. A null sequence is returned as is.
	 * @param cSet
	 * 	character set which defines the valid characters. Can be null, in which case
	 * 	{@code PeptideProperties.standardAASet} is used.
	 * @return
	 * 	a sequence with no invalid characters.
	 */
	public static final String checkSequence(String sequence, Set<Character> cSet) {
		if (sequence == null || !doesSequenceContainInvalidChar(sequence, cSet)) {
			return sequence;
		}
		String cSeq = cleanSequence(sequence, cSet);
		logger.warn("There exists invalid characters in the sequence. Computed results might not be precise.");
		logger.warn("To remove this warning: Please use org.biojava.nbio.aaproperties.Utils.cleanSequence to clean sequence.");
		return cSeq;
	}

	/** Returns cSet, or the default PeptideProperties.standardAASet when cSet is null. */
	private static Set<Character> resolveValidChars(Set<Character> cSet) {
		if (cSet == null) {
			return PeptideProperties.standardAASet;
		}
		return cSet;
	}
}