summaryrefslogtreecommitdiff
path: root/examples/partial_gene_match.py
blob: e19cf8b6a5f49ff9ba6e5462887a24626b8f6eca (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# parital_gene_match.py
#
#  Example showing how to create a customized pyparsing Token, in this case,
#  one that is very much like Literal, but which tolerates up to 'n' character
#  mismatches
from pyparsing import *

import urllib.request, urllib.parse, urllib.error

# read in a bunch of genomic data
datafile = urllib.request.urlopen("http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta")
fastasrc = datafile.read()
datafile.close()

"""
Sample header:
>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
"""
integer = Word(nums).setParseAction(lambda t:int(t[0]))
genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" +
                Word(printables)("id") + SkipTo("length=", include=True) +
                integer("genelen") + LineEnd() +
                Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene"))

# read gene data from .fasta file - takes just a few seconds
genedata = OneOrMore(genebit).parseString(fastasrc)


class CloseMatch(Token):
    """A special subclass of Token that does *close* matches. For each
       close match of the given string, a tuple is returned giving the
       found close match, and a list of mismatch positions."""
    def __init__(self, seq, maxMismatches=1):
        super(CloseMatch,self).__init__()
        self.name = seq
        self.sequence = seq
        self.maxMismatches = maxMismatches
        self.errmsg = "Expected " + self.sequence
        self.mayIndexError = False
        self.mayReturnEmpty = False

    def parseImpl( self, instring, loc, doActions=True ):
        start = loc
        instrlen = len(instring)
        maxloc = start + len(self.sequence)

        if maxloc <= instrlen:
            seq = self.sequence
            seqloc = 0
            mismatches = []
            throwException = False
            done = False
            while loc < maxloc and not done:
                if instring[loc] != seq[seqloc]:
                    mismatches.append(seqloc)
                    if len(mismatches) > self.maxMismatches:
                        throwException = True
                        done = True
                loc += 1
                seqloc += 1
        else:
            throwException = True

        if throwException:
            exc = self.myException
            exc.loc = loc
            exc.pstr = instring
            raise exc

        return loc, (instring[start:loc],mismatches)

# using the genedata extracted above, look for close matches of a gene sequence
searchseq = CloseMatch("TTAAATCTAGAAGAT", 3)
for g in genedata:
    print("%s (%d)" % (g.id, g.genelen))
    print("-"*24)
    for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True):
        matched, mismatches = t[0]
        print("MATCH:", searchseq.sequence)
        print("FOUND:", matched)
        if mismatches:
            print("      ", ''.join(' ' if i not in mismatches else '*'
                            for i,c in enumerate(searchseq.sequence)))
        else:
            print("<exact match>")
        print("at location", startLoc)
        print()
    print()