001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.aaproperties; 022 023import org.slf4j.Logger; 024import org.slf4j.LoggerFactory; 025 026import java.nio.CharBuffer; 027import java.util.HashSet; 028import java.util.Set; 029 030/** 031 * This is a utility class that contains utility methods which will facilitates the coding of other methods 032 * 033 * @author kohchuanhock 034 * @version 2011.08.22 035 * @since 3.0.2 036 */ 037public class Utils { 038 039 private final static Logger logger = LoggerFactory.getLogger(Utils.class); 040 041 /** 042 * Returns a value with the desired number of decimal places. 043 * 044 * @param d 045 * value to round 046 * @param c 047 * number of decimal places desired. 048 * Must be greater or equal to zero, otherwise, the given value d would be returned without any modification. 049 * @return 050 * a value with the given number of decimal places. 051 */ 052 public final static double roundToDecimals(double d, int c) { 053 if(c < 0) return d; 054 double p = Math.pow(10,c); 055 d = d * p; 056 double tmp = Math.round(d); 057 return tmp/p; 058 } 059 060 /** 061 * Checks if given sequence contains invalid characters. Returns true if invalid characters are found, else return false. 062 * Note that any characters are deemed as valid only if it is found in cSet. 063 * 064 * @param sequence 065 * protein sequence to be check. 066 * @param cSet 067 * the set of characters that are deemed valid. 068 * @return 069 * true if invalid characters are found, else return false. 070 */ 071 public final static boolean doesSequenceContainInvalidChar(String sequence, Set<Character> cSet){ 072 for(char c:sequence.toCharArray()){ 073 if(!cSet.contains(c)) return true; 074 } 075 return false; 076 } 077 078 /** 079 * Return the number of invalid characters in sequence. 080 * 081 * @param sequence 082 * protein sequence to count for invalid characters. 083 * @param cSet 084 * the set of characters that are deemed valid. 085 * @param ignoreCase 086 * indicates if cases should be ignored 087 * @return 088 * the number of invalid characters in sequence. 089 */ 090 public final static int getNumberOfInvalidChar(String sequence, Set<Character> cSet, boolean ignoreCase){ 091 char[] cArray = ignoreCase ? sequence.toUpperCase().toCharArray(): sequence.toCharArray(); 092 final Set<Character> characterSet = cSet == null ?PeptideProperties.standardAASet: cSet ; 093 int total = (int)CharBuffer.wrap(cArray).chars().filter(character -> !characterSet.contains((char)character)).count(); 094 return total; 095 } 096 097 /** 098 * Returns a new sequence with all invalid characters being replaced by '-'. 099 * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid. 100 * 101 * @param sequence 102 * protein sequence to be clean 103 * @param cSet 104 * user defined characters that are valid. Can be null. If null, then 20 standard protein amino acid codes will be considered as valid. 105 * @return 106 * a new sequence with all invalid characters being replaced by '-'. 107 */ 108 public final static String cleanSequence(String sequence, Set<Character> cSet){ 109 Set<Character> invalidCharSet = new HashSet<Character>(); 110 StringBuilder cleanSeq = new StringBuilder(); 111 if(cSet == null) cSet = PeptideProperties.standardAASet; 112 for(char c:sequence.toCharArray()){ 113 if(!cSet.contains(c)){ 114 cleanSeq.append("-"); 115 invalidCharSet.add(c); 116 }else{ 117 cleanSeq.append(c); 118 } 119 } 120 121 // TODO: Should be StringJoiner once JDK8 used 122 StringBuilder stringBuilder = new StringBuilder(); 123 for(char c: invalidCharSet){ 124 stringBuilder.append("\'" + c + "\'"); 125 } 126 stringBuilder.deleteCharAt(stringBuilder.length()-1); 127 stringBuilder.append(" are being replaced with '-'"); 128 logger.warn(stringBuilder.toString()); 129 130 return cleanSeq.toString(); 131 } 132 133 /** 134 * Checks if the sequence contains invalid characters. 135 * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid. 136 * If yes, it will return a new sequence where invalid characters are replaced with '-'. 137 * If no, it will simply return the input sequence. 138 * 139 * @param sequence 140 * protein sequence to be check for invalid characters. 141 * @return 142 * a sequence with no invalid characters. 143 */ 144 public static final String checkSequence(String sequence){ 145 return checkSequence(sequence, null); 146 } 147 148 /** 149 * Checks if the sequence contains invalid characters. 150 * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid. 151 * If yes, it will return a new sequence where invalid characters are replaced with '-'. 152 * If no, it will simply return the input sequence. 153 * 154 * @param sequence 155 * protein sequence to be check for invalid characters. 156 * @param cSet 157 * character set which define the valid characters. 158 * @return 159 * a sequence with no invalid characters. 160 */ 161 public static final String checkSequence(String sequence, Set<Character> cSet){ 162 boolean containInvalid = false; 163 if(cSet != null){ 164 containInvalid = sequence != null && doesSequenceContainInvalidChar(sequence, cSet); 165 }else{ 166 containInvalid = sequence != null && doesSequenceContainInvalidChar(sequence, PeptideProperties.standardAASet); 167 } 168 if(containInvalid){ 169 String cSeq = cleanSequence(sequence, cSet); 170 logger.warn("There exists invalid characters in the sequence. Computed results might not be precise."); 171 logger.warn("To remove this warning: Please use org.biojava.nbio.aaproperties.Utils.cleanSequence to clean sequence."); 172 173 return cSeq; 174 } 175 else{ 176 return sequence; 177 } 178 } 179}