<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">#!/usr/bin/env perl
#
# generate hints from an Exonerate gff dump
# Mario Stanke June, 2007

use strict;
use Getopt::Long;

my $usage = "$0 -- convert exonerate output file to hints file for AUGUSTUS\n";
$usage .= "\n";
$usage .= "Usage: $0 --in=exfile --out=hintsfile\n";
$usage .= "  Exonerate run like this: exonerate --model protein2genome --showtargetgff T ... &gt; exfile\n";
$usage .= "  options:\n";
$usage .= "  --priority=n       priority of hint group (default 4)\n";
$usage .= "  --minintronlen=n   alignments with gaps shorter than this and longer than maxgaplen are discarded (default 41)\n";
$usage .= "  --maxintronlen=n   alignments with longer gaps are discarded (default 350000\n";
$usage .= "  --CDSpart_cutoff=n this many bp are cut off of each CDSpart hint w.r.t. the exonerate cds (default 9)\n";
$usage .= "  --source=s         source identifier (default 'P')\n";
#$usage .= "  --ssOn             include splice site (dss, ass) hints in output (default false)\n";

my $exfile;
my $hintsfilename;
my $minintronlen = 41;
my $maxintronlen = 350000;
my $CDSpart_cutoff = 15;
my $source="XNT";
my $priority = 4;
my $prgsrc = "xnt2h";
my $line=0;
my $coloffset=0;
my $ssOn=0;
my $CDSpartid = "CDSpart"; # abbreviate to decrease file size
my $prot="";

if ($#ARGV &lt; 1 ) {
    print "$usage";
    exit;
}

GetOptions(
	   'in=s'=&gt;\$exfile,
	   'out=s'=&gt;\$hintsfilename,
	   'minintronlen:i'=&gt;\$minintronlen,
	   'maxintronlen:i'=&gt;\$maxintronlen,
	   'CDSpart_cutoff:i'=&gt;\$CDSpart_cutoff,
	   'source:s'=&gt;\$source,
	   'priority:i'=&gt;\$priority,
	   'ssOn!'=&gt;\$ssOn);

open(XNT, "&lt;$exfile") || die "Couldn't open $exfile\n";
open(HINTS, "&gt;$hintsfilename") || die "Could not open $hintsfilename";


while (&lt;XNT&gt;) {
    s/#.*//;
    next unless /\S/;
    next unless /\texonerate:protein2genome:local\t/; # modified by Katharina on September 8th 2010 because exonerate output format apparently changed regarding the string "local".q
   # print "I am in the file!\n";
    my @f = split /\t/, $_, 9;
    if (@f &lt; 8) { warn "Not gff format"; next }
    my $seqname = $f[0];
    my $type = $f[2];
    my $start = $f[3];
    my $end = $f[4];
    my $score = $f[5];
    my $strand = $f[6];
#    if ($strand eq '-') {
#	$start++; # exonerate has a weird coordinate system
#	$end++;
#    } # apparently this has changed with exonerate versions and it is now standard
    if ($end &lt; $start) {
	my $tmp = $start;
	$start = $end;
	$end = $tmp;
    }
    if ($type eq "intron") {
	# length in range?
	if ($end - $start + 1 &gt;= $minintronlen &amp;&amp; $end - $start + 1 &lt;= $maxintronlen){
	    print HINTS "$seqname\t$prgsrc\tintron\t$start\t$end\t$score\t$strand\t.\tsrc=$source;grp=$prot;pri=$priority\n";
	}
    } elsif ($type eq "cds") {
	$start += $CDSpart_cutoff;
	$end -= $CDSpart_cutoff;
	if ($start &gt; $end) {
	    $start = $end = int(($start+$end)/2);
	}	
	print HINTS "$seqname\t$prgsrc\t$CDSpartid\t$start\t$end\t$score\t$strand\t.\tsrc=$source;grp=$prot;pri=$priority\n";
    } elsif ($type eq "gene") {
	/sequence (\S+) ; /;
	$prot = $1;
    }
}
</pre></body></html>