import math
import os
from collections import Counter

# inPath = "bpi_data"
inPath = "/home/joerg/PycharmProjects/TFStart/corpus/masc_500k_texts"
ending = ".txt"
outName = "MutualInformationMASC.csv"
outFile = open(outName, "w")
outFile.write("Dateset, Dist, MI, MInorm \n")

dists = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]

for root, dirs, fnames in os.walk(inPath):

#for inName in ['BPIC_Challenge_2012.extract.txt',
#               'BPIC_Challenge_2012.extract.complete.txt',
#               'BPIC_Challenge_2012.extract.with.resource.txt',
#               'BPIC_Challenge_2012.extract.complete.with.resource.txt',
#               'BPI_Challenge_2013_closed_problems.extract.txt',
#               'BPI_Challenge_2013_incidents.extract.txt',
#               'BPI_Challenge_2013_closed_problems.extract.with.group.txt',
#               'BPI_Challenge_2013_incidents.extract.with.group.txt',
#               ]:

    for fname in fnames:
        if fname.endswith(ending):
            inName = os.path.join(root, fname)
            print(inName)
            inFile = open(inName, "r")
            inText = inFile.read().replace("\n", " ").split()

            for dist in dists:
                l = len(inText)
                if (dist < l):
#                    print(dist)
                    Mfreqs = dict()
                    Ifreqs = dict()
                    for i in range(0, dist):
                        w1 = inText[i]
                        if w1 in Ifreqs:
                            Ifreqs[w1]+=1
                        else:
                            Ifreqs[w1]=1
                    for i in range(dist, l):
                        w1 = inText[i]
                        w2 = inText[i-dist]
                        if (w1, w2) in Mfreqs:
                            Mfreqs[(w1, w2)]+=1
                        else:
                            Mfreqs[(w1, w2)]=1
                        if w1 in Ifreqs:
                            Ifreqs[w1]+=1
                        else:
                            Ifreqs[w1]=1

                    l = float(l)
                    d = float(dist)

                    h = 0
                    for (w, c) in Ifreqs.items():
                        c = float(c)
                        h -= (c/l) * math.log( (c/l), 2)
                    mi = 0
                    for ((w1, w2), c) in Mfreqs.items():
                        c = float(c)
                        mi += c/(l-d) * math.log((c/(l-d)) / ( (Ifreqs[w1]/l) * (Ifreqs[w2]/l)), 2)

                    minorm = mi/h
                    outFile.write(inName + ", {0}, {1}, {2}\n".format(dist, mi, minorm))
                    os.fsync(outFile.fileno())

outFile.close()
