001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.location; 023 024import org.biojava.nbio.core.exceptions.ParserException; 025import org.biojava.nbio.core.sequence.AccessionID; 026import org.biojava.nbio.core.sequence.DNASequence; 027import org.biojava.nbio.core.sequence.DataSource; 028import org.biojava.nbio.core.sequence.Strand; 029import org.biojava.nbio.core.sequence.location.template.AbstractLocation; 030import org.biojava.nbio.core.sequence.location.template.Location; 031import org.biojava.nbio.core.sequence.location.template.Point; 032import org.biojava.nbio.core.sequence.template.AbstractSequence; 033import org.biojava.nbio.core.sequence.template.Compound; 034 035import java.io.IOException; 036import java.io.Reader; 037import java.util.ArrayList; 038import java.util.List; 039import java.util.regex.Matcher; 040import java.util.regex.Pattern; 041 042/** 043 * Parser for working with INSDC style locations. This class supports the 044 * full range of location types generated by Genbank, INSDC and ENA. 045 * 046 * @author ayates 047 * @author jgrzebyta 048 * @author Paolo Pavan 049 */ 050public class InsdcParser <S extends AbstractSequence<C>, C extends Compound>{ 051 052 private final DataSource dataSource; 053 054 /** 055 * parse a location. if group(1) is null than the feature is on the positive 056 * strand, group(2) start position, group(3) end position. 057 */ 058 // why in the location the first character was ignored? 059 //protected static final Pattern singleLocationPattern = Pattern.compile("(?:[A-Z]([A-Za-z\\.0-9_]*?):)?(<?)(\\d+)(\\.{2}|\\^)?(>?)(\\d+)?(>?)?"); 060 061 // fixed issue #254 062 protected static final Pattern singleLocationPattern = Pattern.compile("(?:([A-Za-z\\.0-9_]*?):)?(<?)(\\d+)(\\.{2}|\\^)?(>?)(\\d+)?(>?)?"); 063 /** 064 * Decodes a split pattern. Split patterns are a composition of multiple 065 * locationsString qualified by actions: join(location,location, ... 066 * location): The indicated elements should be joined (placed end-to-end) to 067 * form one contiguous sequence. order(location,location, ... location): The 068 * elements can be found in the specified order (5' to 3' direction), 069 * nothing is implied about their reasonableness 070 * bond(location,location...location): Found in protein files. These 071 * generally are used to describe disulfide bonds. 072 * complement(location,location...location): consider locations in their 073 * complement versus 074 * 075 * takes in input a comma splitted location string. The split must be done 076 * for outer level commas group(1) is the qualifier group(2) is the location 077 * string to getFeatures. In case of complex splits it will contain the 078 * nested expression 079 * 080 * Not really sure that they are not declared obsolete but they are still in 081 * several files. 082 */ 083 //protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?([^\\)]+)\\)?"); 084 protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?(.+)\\)?"); 085 /** 086 * designed to recursively split a location string in tokens. Valid tokens 087 * are those divided by coma that are not inside a bracket. I. e. split on 088 * the comma only if that comma has zero, or an even number of quotes ahead 089 * of it. 090 */ 091 protected static final String locationSplitPattern = ",(?=([^\\(|\\)]+\\([^\\(|\\)]+\\))[^\\(|\\)]+)"; 092 /** 093 * these variables are used to compute the global start and end of complex 094 * features 095 */ 096 protected Integer featureGlobalStart, featureGlobalEnd; 097 098 //private S referenceSequence = new org.biojava.nbio.core.sequence.DNASequence(); 099 private AbstractSequence referenceSequence = new DNASequence(); 100 101 enum complexFeaturesAppendEnum { 102 103 FLATTEN, HIERARCHICAL; 104 } 105 /** 106 * define the mode in which complex features should be appended in FLATTEN 107 * mode their will be appended to a single feature in HIERARCHICAL mode, the 108 * single mother feature will have a tree of features that will reflect the 109 * construction in genbank file 110 */ 111 private complexFeaturesAppendEnum complexFeaturesAppendMode = complexFeaturesAppendEnum.HIERARCHICAL; 112 113 public void setComplexFeaturesAppendMode(complexFeaturesAppendEnum complexFeaturesAppendMode) { 114 this.complexFeaturesAppendMode = complexFeaturesAppendMode; 115 } 116 117 public InsdcParser() { 118 this(DataSource.ENA); 119 } 120 121 public InsdcParser(DataSource dataSource) { 122 this.dataSource = dataSource; 123 } 124 125 public DataSource getDataSource() { 126 return dataSource; 127 } 128 129 130 131 /** 132 * Main method for parsing a location from a String instance 133 * 134 * @param locationString Represents a logical location 135 * @return The parsed location 136 * @throws ParserException thrown in the event of any error during parsing 137 */ 138 public Location parse(String locationString) throws ParserException { 139 featureGlobalStart = Integer.MAX_VALUE; 140 featureGlobalEnd = 1; 141 142 Location l; 143 List<Location> ll = parseLocationString(locationString, 1); 144 145 if (ll.size() == 1) { 146 l = ll.get(0); 147 } else { 148 l = new SimpleLocation( 149 featureGlobalStart, 150 featureGlobalEnd, 151 Strand.UNDEFINED, 152 ll); 153 } 154 return l; 155 } 156 157 /** 158 * Reader based version of the parse methods. 159 * 160 * @param reader The source of the data; assumes that end of the reader 161 * stream is the end of the location string to parse 162 * @return The parsed location 163 * @throws IOException Thrown with any reader error 164 * @throws ParserException Thrown with any error with parsing locations 165 */ 166 public List<AbstractLocation> parse(Reader reader) throws IOException, ParserException { 167 // use parse(String s) instead! 168 return null; 169 } 170 171 private List<Location> parseLocationString(String string, int versus) throws ParserException { 172 Matcher m; 173 List<Location> boundedLocationsCollection = new ArrayList<Location>(); 174 175 //String[] tokens = string.split(locationSplitPattern); 176 List<String> tokens = splitString(string); 177 for (String t : tokens) { 178 m = genbankSplitPattern.matcher(t); 179 if (!m.find()) { 180 throw new ParserException("Cannot interpret split pattern " + t 181 + "\nin location string:" + string); 182 } 183 String splitQualifier = m.group(1); 184 String splitString = m.group(2); 185 186 if (!splitQualifier.isEmpty()) { 187 //recursive case 188 int localVersus = splitQualifier.equalsIgnoreCase("complement") ? -1 : 1; 189 List<Location> subLocations = parseLocationString(splitString, versus * localVersus); 190 191 switch (complexFeaturesAppendMode) { 192 case FLATTEN: 193 boundedLocationsCollection.addAll(subLocations); 194 break; 195 case HIERARCHICAL: 196 if (subLocations.size() == 1) { 197 boundedLocationsCollection.addAll(subLocations); 198 } else { 199 Point min = Location.Tools.getMin(subLocations).getStart(); 200 Point max = Location.Tools.getMax(subLocations).getEnd(); 201 AbstractLocation motherLocation 202 = new SimpleLocation( 203 min, 204 max 205 ); 206 207 if (splitQualifier.equalsIgnoreCase("join")) { 208 motherLocation = new InsdcLocations.GroupLocation(subLocations); 209 } 210 if (splitQualifier.equalsIgnoreCase("order")) { 211 motherLocation = new InsdcLocations.OrderLocation(subLocations); 212 } 213 if (splitQualifier.equalsIgnoreCase("bond")) { 214 motherLocation = new InsdcLocations.BondLocation(subLocations); 215 } 216 motherLocation.setStrand(getGroupLocationStrand(subLocations)); 217 boundedLocationsCollection.add(motherLocation); 218 } 219 break; 220 } 221 } else { 222 //base case 223 m = singleLocationPattern.matcher(splitString); 224 if (!m.find()) { 225 throw new ParserException("Cannot interpret location pattern " + splitString 226 + "\nin location string:" + string); 227 } 228 229 String accession = m.group(1); 230 Strand s = versus == 1 ? Strand.POSITIVE : Strand.NEGATIVE; 231 int start = Integer.parseInt(m.group(3)); 232 int end = m.group(6) == null ? start : new Integer(m.group(6)); 233 234 if (featureGlobalStart > start) { 235 featureGlobalStart = start; 236 } 237 if (featureGlobalEnd < end) { 238 featureGlobalEnd = end; 239 } 240 241 AbstractLocation l = new SimpleLocation( 242 start, 243 end, 244 s 245 ); 246 247 if(m.group(4) != null && m.group(4).equals("^")) l.setBetweenCompounds(true); 248 249 if (m.group(2).equals("<")) { 250 l.setPartialOn5prime(true); 251 } 252 if (m.group(5) != null && (m.group(5).equals(">") || m.group(7).equals(">"))) { 253 l.setPartialOn3prime(true); 254 } 255 256 if (!(accession == null || "".equals(accession))) l.setAccession(new AccessionID(accession)); 257 258 boundedLocationsCollection.add(l); 259 260 } 261 } 262 263 return boundedLocationsCollection; 264 } 265 266 267 private List<String> splitString(String input) { 268 List<String> result = new ArrayList<String>(); 269 int start = 0; 270 int openedParenthesis = 0; 271 for (int current = 0; current < input.length(); current++) { 272 if (input.charAt(current) == '(') { 273 openedParenthesis++; 274 } 275 if (input.charAt(current) == ')') { 276 openedParenthesis--; 277 } 278 boolean atLastChar = (current == input.length() - 1); 279 if (atLastChar) { 280 result.add(input.substring(start)); 281 } else if (input.charAt(current) == ',' && openedParenthesis == 0) { 282 result.add(input.substring(start, current)); 283 start = current + 1; 284 } 285 } 286 return result; 287 } 288 289 private Strand getGroupLocationStrand(List<Location> ll){ 290 Strand returnStrand = null; 291 292 for (Location l: ll) { 293 if (returnStrand == null) returnStrand = l.getStrand(); 294 if (returnStrand != l.getStrand()) return Strand.UNDEFINED; 295 } 296 return returnStrand; 297 } 298 299 public static void main(String[] args){ 300 String[] testStrings = { 301 "J00194.1:100..202", 302 "A00001.5:34..45", 303 "43..129", 304 "bond(55,110)", 305 "bond(34,35),join(56..80),complement(45,73)", 306 "order(complement(30,40),70..80),bond(34,35),join(56,80),complement(45..56)", 307 "join(join(complement(30,40),complement(70..80)),bond(34,35),join(56,80),complement(45..56))", 308 "complement(join(complement(2000..4000),complement(70..80)),bond(34,35),join(56,80),complement(45..56))", 309 310 }; 311 InsdcParser p = new InsdcParser(); 312 p.setComplexFeaturesAppendMode(complexFeaturesAppendEnum.HIERARCHICAL); 313 314 for (String s: testStrings){ 315 Location l = p.parse(s); 316 System.out.println(l.toString()); 317 } 318 319 } 320 321}