001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.location; 023 024import org.biojava.nbio.core.exceptions.ParserException; 025import org.biojava.nbio.core.sequence.AccessionID; 026import org.biojava.nbio.core.sequence.DataSource; 027import org.biojava.nbio.core.sequence.Strand; 028import org.biojava.nbio.core.sequence.location.template.AbstractLocation; 029import org.biojava.nbio.core.sequence.location.template.Location; 030import org.biojava.nbio.core.sequence.location.template.Point; 031 032import java.util.ArrayList; 033import java.util.List; 034import java.util.regex.Matcher; 035import java.util.regex.Pattern; 036 037/** 038 * Parser for working with INSDC style locations. This class supports the 039 * full range of location types generated by Genbank, INSDC and ENA. 040 * 041 * @author ayates 042 * @author jgrzebyta 043 * @author Paolo Pavan 044 */ 045public class InsdcParser { 046 047 private boolean isSequenceCircular; 048 private long sequenceLength; 049 050 private final DataSource dataSource; 051 052 /** 053 * parse a location. if group(1) is null than the feature is on the positive 054 * strand, group(2) start position, group(3) end position. 055 */ 056 protected static final Pattern singleLocationPattern = Pattern.compile("(?:([A-Za-z\\.0-9_]*?):)?(<?)(\\d+)(\\.{2}|\\^)?(>?)(\\d+)?(>?)?"); 057 /** 058 * Decodes a split pattern. Split patterns are a composition of multiple 059 * locationsString qualified by actions: join(location,location, ... 060 * location): The indicated elements should be joined (placed end-to-end) to 061 * form one contiguous sequence. order(location,location, ... location): The 062 * elements can be found in the specified order (5' to 3' direction), 063 * nothing is implied about their reasonableness 064 * bond(location,location...location): Found in protein files. These 065 * generally are used to describe disulfide bonds. 066 * complement(location,location...location): consider locations in their 067 * complement versus 068 * 069 * takes in input a comma splitted location string. The split must be done 070 * for outer level commas group(1) is the qualifier group(2) is the location 071 * string to getFeatures. In case of complex splits it will contain the 072 * nested expression 073 * 074 * Not really sure that they are not declared obsolete but they are still in 075 * several files. 076 */ 077 protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?(.+)\\)?"); 078 /** 079 * designed to recursively split a location string in tokens. Valid tokens 080 * are those divided by coma that are not inside a bracket. I. e. split on 081 * the comma only if that comma has zero, or an even number of quotes ahead 082 * of it. 083 */ 084 protected static final String locationSplitPattern = ",(?=([^\\(|\\)]+\\([^\\(|\\)]+\\))[^\\(|\\)]+)"; 085 /** 086 * these variables are used to compute the global start and end of complex 087 * features 088 */ 089 protected Integer featureGlobalStart, featureGlobalEnd; 090 091 enum complexFeaturesAppendEnum { 092 093 FLATTEN, HIERARCHICAL; 094 } 095 /** 096 * define the mode in which complex features should be appended in FLATTEN 097 * mode their will be appended to a single feature in HIERARCHICAL mode, the 098 * single mother feature will have a tree of features that will reflect the 099 * construction in genbank file 100 */ 101 private complexFeaturesAppendEnum complexFeaturesAppendMode = complexFeaturesAppendEnum.HIERARCHICAL; 102 103 public void setComplexFeaturesAppendMode(complexFeaturesAppendEnum complexFeaturesAppendMode) { 104 this.complexFeaturesAppendMode = complexFeaturesAppendMode; 105 } 106 107 public InsdcParser() { 108 this(DataSource.ENA); 109 } 110 111 public InsdcParser(DataSource dataSource) { 112 this.dataSource = dataSource; 113 } 114 115 public DataSource getDataSource() { 116 return dataSource; 117 } 118 119 public void setSequenceCircular(boolean sequenceCircular) { 120 isSequenceCircular = sequenceCircular; 121 } 122 123 public void setSequenceLength(long sequenceLength) { 124 this.sequenceLength = sequenceLength; 125 } 126 127 /** 128 * Main method for parsing a location from a String instance 129 * 130 * @param locationString Represents a logical location 131 * @return The parsed location 132 * @throws ParserException thrown in the event of any error during parsing 133 */ 134 public Location parse(String locationString) throws ParserException { 135 featureGlobalStart = Integer.MAX_VALUE; 136 featureGlobalEnd = 1; 137 138 Location l; 139 List<Location> ll = parseLocationString(locationString, 1); 140 141 if (ll.size() == 1) { 142 l = ll.get(0); 143 } else { 144 l = new SimpleLocation( 145 new SimplePoint(featureGlobalStart), 146 new SimplePoint(featureGlobalEnd), 147 Strand.UNDEFINED, 148 isSequenceCircular, 149 ll); 150 } 151 return l; 152 } 153 154 private List<Location> parseLocationString(String string, int versus) throws ParserException { 155 Matcher m; 156 List<Location> boundedLocationsCollection = new ArrayList<Location>(); 157 158 List<String> tokens = splitString(string); 159 for (String t : tokens) { 160 m = genbankSplitPattern.matcher(t); 161 if (!m.find()) { 162 throw new ParserException("Cannot interpret split pattern " + t 163 + "\nin location string:" + string); 164 } 165 String splitQualifier = m.group(1); 166 String splitString = m.group(2); 167 168 if (!splitQualifier.isEmpty()) { 169 //recursive case 170 int localVersus = splitQualifier.equalsIgnoreCase("complement") ? -1 : 1; 171 List<Location> subLocations = parseLocationString( 172 splitString, versus * localVersus); 173 174 switch (complexFeaturesAppendMode) { 175 case FLATTEN: 176 boundedLocationsCollection.addAll(subLocations); 177 break; 178 case HIERARCHICAL: 179 if (subLocations.size() == 1) { 180 boundedLocationsCollection.addAll(subLocations); 181 } else { 182 Point min = Location.Tools.getMin(subLocations).getStart(); 183 Point max = Location.Tools.getMax(subLocations).getEnd(); 184 AbstractLocation motherLocation 185 = new SimpleLocation( 186 min, 187 max 188 ); 189 190 if (splitQualifier.equalsIgnoreCase("join")) { 191 motherLocation = new InsdcLocations.GroupLocation(subLocations); 192 } 193 if (splitQualifier.equalsIgnoreCase("order")) { 194 motherLocation = new InsdcLocations.OrderLocation(subLocations); 195 } 196 if (splitQualifier.equalsIgnoreCase("bond")) { 197 motherLocation = new InsdcLocations.BondLocation(subLocations); 198 } 199 motherLocation.setStrand(getGroupLocationStrand(subLocations)); 200 boundedLocationsCollection.add(motherLocation); 201 } 202 break; 203 } 204 } else { 205 //base case 206 m = singleLocationPattern.matcher(splitString); 207 if (!m.find()) { 208 throw new ParserException("Cannot interpret location pattern " + splitString 209 + "\nin location string:" + string); 210 } 211 212 String accession = m.group(1); 213 Strand s = versus == 1 ? Strand.POSITIVE : Strand.NEGATIVE; 214 int start = Integer.valueOf(m.group(3)); 215 int end = m.group(6) == null ? start : Integer.valueOf(m.group(6)); 216 217 if (featureGlobalStart > start) { 218 featureGlobalStart = start; 219 } 220 if (featureGlobalEnd < end) { 221 featureGlobalEnd = end; 222 } 223 224 AbstractLocation l; 225 if (start <= end) { 226 l = new SimpleLocation( 227 start, 228 end, 229 s 230 ); 231 } else { 232 // in case of location spanning the end point, Location contract wants sublocations 233 AbstractLocation l5prime = new SimpleLocation( 234 1, 235 end, 236 Strand.UNDEFINED 237 ); 238 AbstractLocation l3prime = new SimpleLocation( 239 start, 240 (int) sequenceLength, 241 Strand.UNDEFINED 242 ); 243 244 l = new InsdcLocations.GroupLocation( 245 new SimplePoint(start), 246 new SimplePoint(end), 247 s, 248 isSequenceCircular, 249 l5prime, l3prime 250 ); 251 252 } 253 254 if(m.group(4) != null && m.group(4).equals("^")) l.setBetweenCompounds(true); 255 256 if (m.group(2).equals("<")) { 257 l.setPartialOn5prime(true); 258 } 259 if (m.group(5) != null && (m.group(5).equals(">") || m.group(7).equals(">"))) { 260 l.setPartialOn3prime(true); 261 } 262 263 if (!(accession == null || "".equals(accession))) l.setAccession(new AccessionID(accession)); 264 265 boundedLocationsCollection.add(l); 266 267 } 268 } 269 270 return boundedLocationsCollection; 271 } 272 273 274 private List<String> splitString(String input) { 275 List<String> result = new ArrayList<String>(); 276 int start = 0; 277 int openedParenthesis = 0; 278 for (int current = 0; current < input.length(); current++) { 279 if (input.charAt(current) == '(') { 280 openedParenthesis++; 281 } 282 if (input.charAt(current) == ')') { 283 openedParenthesis--; 284 } 285 boolean atLastChar = (current == input.length() - 1); 286 if (atLastChar) { 287 result.add(input.substring(start)); 288 } else if (input.charAt(current) == ',' && openedParenthesis == 0) { 289 result.add(input.substring(start, current)); 290 start = current + 1; 291 } 292 } 293 return result; 294 } 295 296 private Strand getGroupLocationStrand(List<Location> ll){ 297 Strand returnStrand = null; 298 299 for (Location l: ll) { 300 if (returnStrand == null) returnStrand = l.getStrand(); 301 if (returnStrand != l.getStrand()) return Strand.UNDEFINED; 302 } 303 return returnStrand; 304 } 305 306 public static void main(String[] args){ 307 String[] testStrings = { 308 "J00194.1:100..202", 309 "A00001.5:34..45", 310 "43..129", 311 "bond(55,110)", 312 "bond(34,35),join(56..80),complement(45,73)", 313 "order(complement(30,40),70..80),bond(34,35),join(56,80),complement(45..56)", 314 "join(join(complement(30,40),complement(70..80)),bond(34,35),join(56,80),complement(45..56))", 315 "complement(join(complement(2000..4000),complement(70..80)),bond(34,35),join(56,80),complement(45..56))", 316 317 }; 318 InsdcParser p = new InsdcParser(); 319 p.setComplexFeaturesAppendMode(complexFeaturesAppendEnum.HIERARCHICAL); 320 321 for (String s: testStrings){ 322 Location l = p.parse(s); 323 System.out.println(l.toString()); 324 } 325 326 } 327 328}