001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021/**
022 *
023 */
024package org.biojava.nbio.core.sequence.io;
025
026import org.biojava.nbio.core.sequence.Strand;
027import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
028import org.biojava.nbio.core.sequence.features.FeatureInterface;
029import org.biojava.nbio.core.sequence.features.Qualifier;
030import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
031import org.biojava.nbio.core.sequence.location.template.Location;
032import org.biojava.nbio.core.sequence.location.template.Point;
033import org.biojava.nbio.core.sequence.template.AbstractSequence;
034import org.biojava.nbio.core.sequence.template.Compound;
035import org.biojava.nbio.core.util.StringManipulationHelper;
036
037import java.util.ArrayList;
038import java.util.Collections;
039import java.util.Formatter;
040import java.util.List;
041import java.util.Locale;
042
043/**
044 * @author mckeee1
045 *
046 */
047public class GenericInsdcHeaderFormat<S extends AbstractSequence<C>, C extends Compound> {
048        protected static final int MAX_WIDTH = 80;
049        protected static final int QUALIFIER_INDENT = 21;
050        protected static final String QUALIFIER_INDENT_STR = "                     ";
051        protected static final String QUALIFIER_INDENT_TMP = "     %s                ";
052        private static final String lineSep = "%n";
053
054        /**
055         * Format a feature qualifier using the MAX_WIDTH (default 80)
056         * @param key
057         * @param value
058         * @param quote
059         */
060        private String _write_feature_qualifier(String key, String value, boolean quote) {
061                String line = "";
062                if(null == value) {
063                        line = QUALIFIER_INDENT_STR + "/" + key + lineSep;
064                        return line;
065                }
066                if(quote) {  // quote should be true for numerics
067                        line = QUALIFIER_INDENT_STR + "/" + key + "=\"" + value + "\"";
068                } else {
069                        line = QUALIFIER_INDENT_STR + "/" + key + "=" + value;
070                }
071                if(line.length() <= MAX_WIDTH) {
072                        return line + lineSep;
073                }
074                String goodlines = "";
075                while(!"".equals(line.replaceAll("^\\s+", ""))) {
076                        if(line.length() <= MAX_WIDTH) {
077                                goodlines += line + lineSep;
078                                break;
079                        }
080                        //Insert line break...
081                        int index;
082                        for(index = Math.min(line.length()-1, MAX_WIDTH); index > QUALIFIER_INDENT ; index--) {
083                                if(' ' == line.charAt(index)) {
084                                        break;
085                                }
086                        }
087                        if(' ' != line.charAt(index)) {
088                                //no nice place to break...
089                                index = MAX_WIDTH;
090                        }
091                        assert index <= MAX_WIDTH;
092                        goodlines += line.substring(0,index) + lineSep;
093                        line = QUALIFIER_INDENT_STR + line.substring(index).replaceAll("^\\s+", "");
094                }
095                return goodlines;
096        }
097        /**
098         * Split a feature location into lines (break at commas).
099         * @param location
100         */
101        private String _wrap_location(String location) {
102                int length = MAX_WIDTH - QUALIFIER_INDENT;
103                if(location.length() <= length) {
104                        return location;
105                }
106                int index = location.substring(0, length).lastIndexOf(",");
107                if(-1 == index) {
108                        //No good place to split (!)
109                        return location;
110                }
111                return location.substring(0,index+1) + lineSep + QUALIFIER_INDENT_STR + _wrap_location(location.substring(index+1));
112        }
113        /**
114         * Write a single SeqFeature object to features table.
115         * @param feature
116         * @param record_length
117         */
118        protected String _write_feature(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
119                String location = _insdc_feature_location_string(feature, record_length);
120                String f_type = feature.getType().replace(" ", "_");
121                StringBuilder sb = new StringBuilder();
122                Formatter formatter = new Formatter(sb,Locale.US);
123                formatter.format(QUALIFIER_INDENT_TMP, f_type);
124                String line = formatter.toString().substring(0, QUALIFIER_INDENT) + _wrap_location(location) + lineSep;
125                formatter.close();
126
127                //Now the qualifiers...
128                for(List<Qualifier>  qualifiers : feature.getQualifiers().values()) {
129                        for(Qualifier q : qualifiers){
130                                if (q instanceof DBReferenceInfo) {
131                                        DBReferenceInfo db = (DBReferenceInfo) q;
132                                        line += _write_feature_qualifier(q.getName().replaceAll("%","%%"), db.getDatabase().replaceAll("%","%%") + ":" + db.getId().replaceAll("%","%%"), db.needsQuotes());    
133                                } else {
134                                        line += _write_feature_qualifier(q.getName().replaceAll("%","%%"), q.getValue().replaceAll("%","%%"), q.needsQuotes());
135                                }
136                        }
137                }
138                return line;
139                /*
140                self.handle.write(line)
141                #Now the qualifiers...
142                for key, values in feature.qualifiers.items():
143                        if isinstance(values, list) or isinstance(values, tuple):
144                                for value in values:
145                                        self._write_feature_qualifier(key, value)
146                        elif values:
147                                #String, int, etc
148                                self._write_feature_qualifier(key, values)
149                        else:
150                                #e.g. a /psuedo entry
151                                self._write_feature_qualifier(key)
152                 */
153        }
154        /**
155         * Build a GenBank/EMBL location string from a SeqFeature (PRIVATE).
156
157        There is a choice of how to show joins on the reverse complement strand,
158        GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
159        "join(complement(20,100),complement(1,10))" instead (but appears to have
160        now adopted the GenBank convention). Notice that the order of the entries
161        is reversed! This function therefore uses the first form. In this situation
162        we expect the parent feature and the two children to all be marked as
163        strand == -1, and in the order 0:10 then 19:100.
164
165        Also need to consider dual-strand examples like these from the Arabidopsis
166        thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650)
167        gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057)
168        which is further complicated by a splice:
169        join(complement(69611..69724),139856..140087,140625..140650)
170
171        For mixed this mixed strand feature, the parent SeqFeature should have
172        no strand (either 0 or None) while the child features should have either
173        strand +1 or -1 as appropriate, and be listed in the order given here.
174         * @param feature
175         * @param record_length
176         */
177        private String _insdc_feature_location_string(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
178                if(feature.getChildrenFeatures().isEmpty()) {                   
179                        if(feature.getLocations().getSubLocations().isEmpty()) {
180                                //Non-recursive.
181                                String location = _insdc_location_string_ignoring_strand_and_subfeatures(feature.getLocations(), record_length);
182                                if(feature.getLocations().getStrand() == Strand.NEGATIVE) {
183                                        StringBuilder sb = new StringBuilder();
184                                        Formatter formatter = new Formatter(sb,Locale.US);
185                                        formatter.format("complement(%s)", location);
186                                        String output = formatter.toString();
187                                        formatter.close();
188                                        location = output;
189                                }
190                                return location;        
191                                
192                        } else if (feature.getLocations().getStrand() == Strand.NEGATIVE) {
193                                
194                                // As noted above, treat reverse complement strand features carefully:
195
196                                // check if any of the sublocations strand differs from the parent features strand
197                                for(Location l  : feature.getLocations().getSubLocations()) {                                   
198                                        if (l.getStrand() != Strand.NEGATIVE) {                                         
199                                                StringBuilder sb = new StringBuilder();
200                                                Formatter formatter = new Formatter(sb, Locale.US);
201                                                formatter.format("Inconsistent strands: %s for parent, %s for child",
202                                                                feature.getLocations().getStrand(), l.getStrand());
203                                                String output = formatter.toString();
204                                                formatter.close();
205                                                throw new RuntimeException(output);                                             
206                                        }                                       
207                                }
208                                
209                                StringBuilder sb = new StringBuilder();
210                                Formatter formatter = new Formatter(sb, Locale.US);
211                                ArrayList<String> locations = new ArrayList<String>();
212                                for(Location l  : feature.getLocations().getSubLocations()) {   
213                                        locations.add(_insdc_location_string_ignoring_strand_and_subfeatures((AbstractLocation) l, record_length));                                     
214                                }
215                                String location = StringManipulationHelper.join(locations, ",");
216                                formatter.format("complement(%s(%s))", /* feature.location_operator */ "join", location);
217                                String output = formatter.toString();
218                                formatter.close();
219                                return output;
220
221                        } else {
222                                //Convert feature sub-locations into joins
223                                //This covers typical forward strand features, and also an evil mixed strand:
224                                StringBuilder sb = new StringBuilder();
225                                Formatter formatter = new Formatter(sb,Locale.US);
226                                ArrayList<String> locations = new ArrayList<String>();
227                                for(Location l  : feature.getLocations().getSubLocations()) {   
228                                        locations.add(_insdc_location_string_ignoring_strand_and_subfeatures((AbstractLocation) l, record_length));                                     
229                                }
230                                String location =  StringManipulationHelper.join(locations, ",");
231                                formatter.format("%s(%s)", /*feature.location_operator*/ "join", location);
232                                String output = formatter.toString();
233                                formatter.close();
234                                return output;
235                        }                       
236                }
237                // As noted above, treat reverse complement strand features carefully:
238                if(feature.getLocations().getStrand() == Strand.NEGATIVE) {
239                        for(FeatureInterface<?, ?> f  : feature.getChildrenFeatures()) {
240                                if(f.getLocations().getStrand() != Strand.NEGATIVE) {
241                                        StringBuilder sb = new StringBuilder();
242                                        Formatter formatter = new Formatter(sb,Locale.US);
243                                        formatter.format("Inconsistent strands: %s for parent, %s for child", feature.getLocations().getStrand(), f.getLocations().getStrand());
244                                        String output = formatter.toString();
245                                        formatter.close();
246                                        throw new RuntimeException(output);
247                                }
248                        }
249                        StringBuilder sb = new StringBuilder();
250                        Formatter formatter = new Formatter(sb,Locale.US);
251                        ArrayList<String> locations = new ArrayList<String>();
252                        for(FeatureInterface<AbstractSequence<C>, C> f  : feature.getChildrenFeatures()) {
253                                locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length));
254                        }
255                        String location = StringManipulationHelper.join(locations, ",");
256                        formatter.format("complement(%s(%s))", /*feature.location_operator*/ "join", location);
257                        String output = formatter.toString();
258                        formatter.close();
259                        return output;
260                }
261                //This covers typical forward strand features, and also an evil mixed strand:
262                StringBuilder sb = new StringBuilder();
263                Formatter formatter = new Formatter(sb,Locale.US);
264                ArrayList<String> locations = new ArrayList<String>();
265                for(FeatureInterface<AbstractSequence<C>, C> f  : feature.getChildrenFeatures()) {
266                        locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length));
267                }
268                String location =  StringManipulationHelper.join(locations, ",");
269                formatter.format("%s(%s)", /*feature.location_operator*/ "join", location);
270                String output = formatter.toString();
271                formatter.close();
272                return output;
273        }
274
275        private String _insdc_location_string_ignoring_strand_and_subfeatures(
276                        //SequenceLocation<AbstractSequence<C>, C> sequenceLocation,
277                                                AbstractLocation sequenceLocation,
278                        int record_length) {
279        /*
280        if location.ref:
281                ref = "%s:" % location.ref
282        else:
283                ref = ""
284        assert not location.ref_db
285        */
286                String ref = "";
287                if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart() == sequenceLocation.getEnd()) {
288                        //Special case, for 12:12 return 12^13
289                        //(a zero length slice, meaning the point between two letters)
290                        if(sequenceLocation.getEnd().getPosition() == record_length) {
291                                //Very special case, for a between position at the end of a
292                                //sequence (used on some circular genomes, Bug 3098) we have
293                                //N:N so return N^1
294                                StringBuilder sb = new StringBuilder();
295                                Formatter formatter = new Formatter(sb,Locale.US);
296                                formatter.format("%s%d^1", ref, record_length);
297                                String output = formatter.toString();
298                                formatter.close();
299                                return output;
300                        } else {
301                                StringBuilder sb = new StringBuilder();
302                                Formatter formatter = new Formatter(sb,Locale.US);
303                                formatter.format("%s%d^%d", ref, sequenceLocation.getStart().getPosition(), sequenceLocation.getEnd().getPosition());
304                                String output = formatter.toString();
305                                formatter.close();
306                                return output;
307                        }
308                }
309                if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart().getPosition() + 1 == sequenceLocation.getEnd().getPosition()) {
310                        //Special case, for 11:12 return 12 rather than 12..12
311                        //(a length one slice, meaning a single letter)
312                        StringBuilder sb = new StringBuilder();
313                        Formatter formatter = new Formatter(sb,Locale.US);
314                        formatter.format("%s%d", ref, sequenceLocation.getEnd().getPosition());
315                        String output = formatter.toString();
316                        formatter.close();
317                        return output;
318                } else if(sequenceLocation.getStart().isUnknown() || sequenceLocation.getEnd().isUnknown()) {
319                        //Special case for features from SwissProt/UniProt files
320                        if(sequenceLocation.getStart().isUnknown() && sequenceLocation.getEnd().isUnknown()) {
321                                throw new RuntimeException("Feature with unknown location");
322                        } else if(sequenceLocation.getStart().isUnknown()) {
323                                //Treat the unknown start position as a BeforePosition
324                                StringBuilder sb = new StringBuilder();
325                                Formatter formatter = new Formatter(sb,Locale.US);
326                                formatter.format("%s<%d..%s", ref, sequenceLocation.getEnd().getPosition(), _insdc_feature_position_string(sequenceLocation.getEnd()));
327                                String output = formatter.toString();
328                                formatter.close();
329                                return output;
330                        } else {
331                                //Treat the unknown start position as an AfterPosition
332                                StringBuilder sb = new StringBuilder();
333                                Formatter formatter = new Formatter(sb,Locale.US);
334                                formatter.format("%s%s..>%d", ref, _insdc_feature_position_string(sequenceLocation.getStart()), sequenceLocation.getStart().getPosition());
335                                String output = formatter.toString();
336                                formatter.close();
337                                return output;
338                        }
339                } else {
340                        //Typical case, e.g. 12..15 gets mapped to 11:15
341                        String start = _insdc_feature_position_string(sequenceLocation.getStart());
342                        String end = _insdc_feature_position_string(sequenceLocation.getEnd()); 
343                        
344                        if (sequenceLocation.isPartial()) {
345                                if (sequenceLocation.isPartialOn5prime()) {
346                                        start = "<" + start;
347                                }
348                                
349                                if (sequenceLocation.isPartialOn3prime()) {
350                                        end = ">" + end;
351                                }       
352                        }
353                        
354                        return ref + start + ".." + end;
355                }
356        }
357        private String _insdc_feature_position_string(Point location) {
358                // TODO Auto-generated method stub
359                return _insdc_feature_position_string(location, 0);
360        }
361
362        /**
363         * Build a GenBank/EMBL position string (PRIVATE).
364         * @param location
365         * @param increment
366         */
367        private String _insdc_feature_position_string(Point location, int increment) {
368                        StringBuilder sb = new StringBuilder();
369                        Formatter formatter = new Formatter(sb,Locale.US);
370                        formatter.format("%s", location.getPosition() + increment);
371                        String output = formatter.toString();
372                        formatter.close();
373                        return output;
374
375        /*
376        if isinstance(pos, SeqFeature.ExactPosition):
377                return "%i" % (pos.position+offset)
378        elif isinstance(pos, SeqFeature.WithinPosition):
379                return "(%i.%i)" % (pos.position + offset,
380                                                        pos.position + pos.extension + offset)
381        elif isinstance(pos, SeqFeature.BetweenPosition):
382                return "(%i^%i)" % (pos.position + offset,
383                                                        pos.position + pos.extension + offset)
384        elif isinstance(pos, SeqFeature.BeforePosition):
385                return "<%i" % (pos.position + offset)
386        elif isinstance(pos, SeqFeature.AfterPosition):
387                return ">%i" % (pos.position + offset)
388        elif isinstance(pos, SeqFeature.OneOfPosition):
389                return "one-of(%s)" \
390                           % ",".join([_insdc_feature_position_string(p,offset) \
391                                                   for p in pos.position_choices])
392        elif isinstance(pos, SeqFeature.AbstractPosition):
393                raise NotImplementedError("Please report this as a bug in Biopython.")
394        else:
395                raise ValueError("Expected a SeqFeature position object.")
396                 */
397        }
398
399        /**
400         * Returns a list of strings.
401         *
402         *   Any single words which are too long get returned as a whole line
403         *   (e.g. URLs) without an exception or warning.
404         * @param text
405         * @param max_len
406         */
407        protected ArrayList<String> _split_multi_line(String text, int max_len) {
408                // TODO Auto-generated method stub
409                ArrayList<String> output = new ArrayList<String>();
410                text = text.trim();
411                if(text.length() <= max_len) {
412                        output.add(text);
413                        return output;
414                }
415
416                ArrayList<String> words = new ArrayList<String>();
417                Collections.addAll(words, text.split("\\s+"));
418                while(!words.isEmpty()) {
419                        text = words.remove(0);
420                        while(!words.isEmpty() && (text.length() + 1 + words.get(0).length()) <= max_len) {
421                                text += " " + words.remove(0);
422                                text = text.trim();
423                        }
424                        output.add(text);
425                }
426                assert words.isEmpty();
427                return output;
428        }
429
430
431}