001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021/**
022 *
023 */
024package org.biojava.nbio.core.sequence.io;
025
026import org.biojava.nbio.core.sequence.Strand;
027import org.biojava.nbio.core.sequence.features.FeatureInterface;
028import org.biojava.nbio.core.sequence.features.Qualifier;
029import org.biojava.nbio.core.sequence.location.template.Point;
030import org.biojava.nbio.core.sequence.template.AbstractSequence;
031import org.biojava.nbio.core.sequence.template.Compound;
032import org.biojava.nbio.core.util.StringManipulationHelper;
033
034import java.util.ArrayList;
035import java.util.Collections;
036import java.util.Formatter;
037import java.util.List;
038import java.util.Locale;
039
040/**
041 * @author mckeee1
042 *
043 */
044public class GenericInsdcHeaderFormat<S extends AbstractSequence<C>, C extends Compound> {
045        protected static final int MAX_WIDTH = 80;
046        protected static final int QUALIFIER_INDENT = 21;
047        protected static final String QUALIFIER_INDENT_STR = "                     ";
048        protected static final String QUALIFIER_INDENT_TMP = "     %s                ";
049        private static final String lineSep = "%n";
050
051        /**
052         * Format a feature qualifier using the MAX_WIDTH (default 80)
053         * @param key
054         * @param value
055         * @param quote
056         */
057        private String _write_feature_qualifier(String key, String value, boolean quote) {
058                String line = "";
059                if(null == value) {
060                        line = QUALIFIER_INDENT_STR + "/" + key + lineSep;
061                        return line;
062                }
063                if(quote) {  // quote should be true for numerics
064                        line = QUALIFIER_INDENT_STR + "/" + key + "=\"" + value + "\"";
065                } else {
066                        line = QUALIFIER_INDENT_STR + "/" + key + "=" + value;
067                }
068                if(line.length() <= MAX_WIDTH) {
069                        return line + lineSep;
070                }
071                String goodlines = "";
072                while(!"".equals(line.replaceAll("^\\s+", ""))) {
073                        if(line.length() <= MAX_WIDTH) {
074                                goodlines += line + lineSep;
075                                break;
076                        }
077                        //Insert line break...
078                        int index;
079                        for(index = Math.min(line.length()-1, MAX_WIDTH); index > QUALIFIER_INDENT ; index--) {
080                                if(' ' == line.charAt(index)) {
081                                        break;
082                                }
083                        }
084                        if(' ' != line.charAt(index)) {
085                                //no nice place to break...
086                                index = MAX_WIDTH;
087                        }
088                        assert index <= MAX_WIDTH;
089                        goodlines += line.substring(0,index) + lineSep;
090                        line = QUALIFIER_INDENT_STR + line.substring(index).replaceAll("^\\s+", "");
091                }
092                return goodlines;
093        }
094        /**
095         * Split a feature location into lines (break at commas).
096         * @param location
097         */
098        private String _wrap_location(String location) {
099                int length = MAX_WIDTH - QUALIFIER_INDENT;
100                if(location.length() <= length) {
101                        return location;
102                }
103                int index = location.substring(length).lastIndexOf(",");
104                if(-1 == index) {
105                        //No good place to split (!)
106                        return location;
107                }
108                return location.substring(0,index+1) + lineSep + QUALIFIER_INDENT_STR + _wrap_location(location.substring(index+1));
109        }
110        /**
111         * Write a single SeqFeature object to features table.
112         * @param feature
113         * @param record_length
114         */
115        protected String _write_feature(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
116                String location = _insdc_feature_location_string(feature, record_length);
117                String f_type = feature.getType().replace(" ", "_");
118                StringBuilder sb = new StringBuilder();
119                Formatter formatter = new Formatter(sb,Locale.US);
120                formatter.format(QUALIFIER_INDENT_TMP, f_type);
121                String line = formatter.toString().substring(0, QUALIFIER_INDENT) + _wrap_location(location) + lineSep;
122                formatter.close();
123
124                //Now the qualifiers...
125                for(List<Qualifier>  qualifiers : feature.getQualifiers().values()) {
126                        for(Qualifier q : qualifiers){
127                                line += _write_feature_qualifier(q.getName(), q.getValue(), q.needsQuotes());
128                        }
129                }
130                return line;
131                /*
132                self.handle.write(line)
133                #Now the qualifiers...
134                for key, values in feature.qualifiers.items():
135                        if isinstance(values, list) or isinstance(values, tuple):
136                                for value in values:
137                                        self._write_feature_qualifier(key, value)
138                        elif values:
139                                #String, int, etc
140                                self._write_feature_qualifier(key, values)
141                        else:
142                                #e.g. a /psuedo entry
143                                self._write_feature_qualifier(key)
144                 */
145        }
146        /**
147         * Build a GenBank/EMBL location string from a SeqFeature (PRIVATE).
148
149        There is a choice of how to show joins on the reverse complement strand,
150        GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
151        "join(complement(20,100),complement(1,10))" instead (but appears to have
152        now adopted the GenBank convention). Notice that the order of the entries
153        is reversed! This function therefore uses the first form. In this situation
154        we expect the parent feature and the two children to all be marked as
155        strand == -1, and in the order 0:10 then 19:100.
156
157        Also need to consider dual-strand examples like these from the Arabidopsis
158        thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650)
159        gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057)
160        which is further complicated by a splice:
161        join(complement(69611..69724),139856..140087,140625..140650)
162
163        For mixed this mixed strand feature, the parent SeqFeature should have
164        no strand (either 0 or None) while the child features should have either
165        strand +1 or -1 as appropriate, and be listed in the order given here.
166         * @param feature
167         * @param record_length
168         */
169        private String _insdc_feature_location_string(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
170                if(feature.getChildrenFeatures().isEmpty()) {
171                        //Non-recursive.
172                        String location = _insdc_location_string_ignoring_strand_and_subfeatures(feature.getLocations(), record_length);
173                        if(feature.getLocations().getStrand() == Strand.NEGATIVE) {
174                                StringBuilder sb = new StringBuilder();
175                                Formatter formatter = new Formatter(sb,Locale.US);
176                                formatter.format("complement(%s)", location);
177                                String output = formatter.toString();
178                                formatter.close();
179                                location = output;
180                        }
181                        return location;
182                }
183                // As noted above, treat reverse complement strand features carefully:
184                if(feature.getLocations().getStrand() == Strand.NEGATIVE) {
185                        for(FeatureInterface<?, ?> f  : feature.getChildrenFeatures()) {
186                                if(f.getLocations().getStrand() != Strand.NEGATIVE) {
187                                        StringBuilder sb = new StringBuilder();
188                                        Formatter formatter = new Formatter(sb,Locale.US);
189                                        formatter.format("Inconsistent strands: %r for parent, %r for child", feature.getLocations().getStrand(), f.getLocations().getStrand());
190                                        String output = formatter.toString();
191                                        formatter.close();
192                                        throw new RuntimeException(output);
193                                }
194                        }
195                        StringBuilder sb = new StringBuilder();
196                        Formatter formatter = new Formatter(sb,Locale.US);
197                        ArrayList<String> locations = new ArrayList<String>();
198                        for(FeatureInterface<AbstractSequence<C>, C> f  : feature.getChildrenFeatures()) {
199                                locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length));
200                        }
201                        String location = StringManipulationHelper.join(locations, ",");
202                        formatter.format("complement(%s(%s))", /*feature.location_operator*/ "join", location);
203                        String output = formatter.toString();
204                        formatter.close();
205                        return output;
206                }
207                //This covers typical forward strand features, and also an evil mixed strand:
208                StringBuilder sb = new StringBuilder();
209                Formatter formatter = new Formatter(sb,Locale.US);
210                ArrayList<String> locations = new ArrayList<String>();
211                for(FeatureInterface<AbstractSequence<C>, C> f  : feature.getChildrenFeatures()) {
212                        locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length));
213                }
214                String location =  StringManipulationHelper.join(locations, ",");
215                formatter.format("%s(%s)", /*feature.location_operator*/ "join", location);
216                String output = formatter.toString();
217                formatter.close();
218                return output;
219        }
220
221        private String _insdc_location_string_ignoring_strand_and_subfeatures(
222                        //SequenceLocation<AbstractSequence<C>, C> sequenceLocation,
223                                                org.biojava.nbio.core.sequence.location.template.AbstractLocation sequenceLocation,
224                        int record_length) {
225        /*
226        if location.ref:
227                ref = "%s:" % location.ref
228        else:
229                ref = ""
230        assert not location.ref_db
231        */
232                String ref = "";
233                if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart() == sequenceLocation.getEnd()) {
234                        //Special case, for 12:12 return 12^13
235                        //(a zero length slice, meaning the point between two letters)
236                        if(sequenceLocation.getEnd().getPosition() == record_length) {
237                                //Very special case, for a between position at the end of a
238                                //sequence (used on some circular genomes, Bug 3098) we have
239                                //N:N so return N^1
240                                StringBuilder sb = new StringBuilder();
241                                Formatter formatter = new Formatter(sb,Locale.US);
242                                formatter.format("%s%d^1", ref, record_length);
243                                String output = formatter.toString();
244                                formatter.close();
245                                return output;
246                        } else {
247                                StringBuilder sb = new StringBuilder();
248                                Formatter formatter = new Formatter(sb,Locale.US);
249                                formatter.format("%s%d^%d", ref, sequenceLocation.getStart().getPosition(), sequenceLocation.getEnd().getPosition());
250                                String output = formatter.toString();
251                                formatter.close();
252                                return output;
253                        }
254                }
255                if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart().getPosition() + 1 == sequenceLocation.getEnd().getPosition()) {
256                        //Special case, for 11:12 return 12 rather than 12..12
257                        //(a length one slice, meaning a single letter)
258                        StringBuilder sb = new StringBuilder();
259                        Formatter formatter = new Formatter(sb,Locale.US);
260                        formatter.format("%s%d", ref, sequenceLocation.getEnd().getPosition());
261                        String output = formatter.toString();
262                        formatter.close();
263                        return output;
264                } else if(sequenceLocation.getStart().isUnknown() || sequenceLocation.getEnd().isUnknown()) {
265                        //Special case for features from SwissProt/UniProt files
266                        if(sequenceLocation.getStart().isUnknown() && sequenceLocation.getEnd().isUnknown()) {
267                                throw new RuntimeException("Feature with unknown location");
268                        } else if(sequenceLocation.getStart().isUnknown()) {
269                                //Treat the unknown start position as a BeforePosition
270                                StringBuilder sb = new StringBuilder();
271                                Formatter formatter = new Formatter(sb,Locale.US);
272                                formatter.format("%s<%d..%s", ref, sequenceLocation.getEnd().getPosition(), _insdc_feature_position_string(sequenceLocation.getEnd()));
273                                String output = formatter.toString();
274                                formatter.close();
275                                return output;
276                        } else {
277                                //Treat the unknown start position as an AfterPosition
278                                StringBuilder sb = new StringBuilder();
279                                Formatter formatter = new Formatter(sb,Locale.US);
280                                formatter.format("%s%s..>%d", ref, _insdc_feature_position_string(sequenceLocation.getStart()), sequenceLocation.getStart().getPosition());
281                                String output = formatter.toString();
282                                formatter.close();
283                                return output;
284                        }
285                } else {
286                        //Typical case, e.g. 12..15 gets mapped to 11:15
287                        return ref + _insdc_feature_position_string(sequenceLocation.getStart(), 0) + ".." + _insdc_feature_position_string(sequenceLocation.getEnd());
288                }
289        }
290        private String _insdc_feature_position_string(Point location) {
291                // TODO Auto-generated method stub
292                return _insdc_feature_position_string(location, 0);
293        }
294
295        /**
296         * Build a GenBank/EMBL position string (PRIVATE).
297         * @param location
298         * @param increment
299         */
300        private String _insdc_feature_position_string(Point location, int increment) {
301                        StringBuilder sb = new StringBuilder();
302                        Formatter formatter = new Formatter(sb,Locale.US);
303                        formatter.format("%s", location.getPosition() + increment);
304                        String output = formatter.toString();
305                        formatter.close();
306                        return output;
307
308        /*
309        if isinstance(pos, SeqFeature.ExactPosition):
310                return "%i" % (pos.position+offset)
311        elif isinstance(pos, SeqFeature.WithinPosition):
312                return "(%i.%i)" % (pos.position + offset,
313                                                        pos.position + pos.extension + offset)
314        elif isinstance(pos, SeqFeature.BetweenPosition):
315                return "(%i^%i)" % (pos.position + offset,
316                                                        pos.position + pos.extension + offset)
317        elif isinstance(pos, SeqFeature.BeforePosition):
318                return "<%i" % (pos.position + offset)
319        elif isinstance(pos, SeqFeature.AfterPosition):
320                return ">%i" % (pos.position + offset)
321        elif isinstance(pos, SeqFeature.OneOfPosition):
322                return "one-of(%s)" \
323                           % ",".join([_insdc_feature_position_string(p,offset) \
324                                                   for p in pos.position_choices])
325        elif isinstance(pos, SeqFeature.AbstractPosition):
326                raise NotImplementedError("Please report this as a bug in Biopython.")
327        else:
328                raise ValueError("Expected a SeqFeature position object.")
329                 */
330        }
331
332        /**
333         * Returns a list of strings.
334         *
335         *   Any single words which are too long get returned as a whole line
336         *   (e.g. URLs) without an exception or warning.
337         * @param text
338         * @param max_len
339         */
340        protected ArrayList<String> _split_multi_line(String text, int max_len) {
341                // TODO Auto-generated method stub
342                ArrayList<String> output = new ArrayList<String>();
343                text = text.trim();
344                if(text.length() <= max_len) {
345                        output.add(text);
346                        return output;
347                }
348
349                ArrayList<String> words = new ArrayList<String>();
350                Collections.addAll(words, text.split("\\s+"));
351                while(!words.isEmpty()) {
352                        text = words.remove(0);
353                        while(!words.isEmpty() && (text.length() + 1 + words.get(0).length()) <= max_len) {
354                                text += " " + words.remove(0);
355                                text = text.trim();
356                        }
357                        output.add(text);
358                }
359                assert words.isEmpty();
360                return output;
361        }
362
363
364}