
Lab Evaluation 1

Portions

PRACTICE SESSION (students are to upload individually to AUMS on or before Sept 26, 2023, by 9 am): Consider a few Word documents and do the following, using any language of your choice:

  1. Stemming (Porter's algorithm; see the sketch after this list)
  2. Inverted index construction
  3. Spell correction using weighted edit distance / k-grams
  4. Boolean model query processing (with or without skip pointers)
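
Item 1 (Porter stemming) is not exercised by the script below, so here is a minimal sketch using NLTK's PorterStemmer. It assumes nltk is installed and the punkt tokenizer data has been fetched with nltk.download('punkt'); the sample line is a hypothetical stand-in for an actual sonnet line.

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
text = "Shall I compare thee to a summer's day?"  # hypothetical sample line
# Lowercase, tokenize, then reduce each token to its Porter stem.
stems = [stemmer.stem(tok) for tok in word_tokenize(text.lower())]
print(stems)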

Lab Evaluation 1: September 29. Portions: algorithms used in Chapters 1-3.

from nltk.tokenize import word_tokenize
from dataclasses import dataclass, field

# Consider the given documents, which are a collection of Shakespeare's sonnets.
# Perform the following operations with respect to the documents:
# a. Tokenize the contents of doc1, doc2 and doc3.
# b. Construct the inverted index.
# c. Retrieve the result for the query "beauty AND thine" using the inverted index.
# d. The query contains the misspelled word "illage". Use 3-gram spell
#    correction to suggest the correct word for it.


@dataclass
class inverted_index_item:
    doc_freq: int = 0
    posting_list: list[int] = field(default_factory=list)

    def __repr__(self) -> str:
        return f"{self.doc_freq}->{self.posting_list}"

    def intersection(self, other):
        # Documents present in both posting lists; sort so the result
        # stays in ascending doc-id order like every other posting list.
        common = sorted(set(self.posting_list) & set(other.posting_list))
        return inverted_index_item(len(common), common)

    def union(self, other):
        # Merge two sorted posting lists, keeping each doc id once.
        ret = []
        p1 = self.posting_list
        p2 = other.posting_list
        p1_ptr = 0
        p2_ptr = 0
        while p1_ptr < len(p1) and p2_ptr < len(p2):
            if p1[p1_ptr] == p2[p2_ptr]:
                ret.append(p1[p1_ptr])
                p1_ptr += 1
                p2_ptr += 1
            # Advance the pointer sitting at the smaller doc id.
            elif p1[p1_ptr] < p2[p2_ptr]:
                ret.append(p1[p1_ptr])
                p1_ptr += 1
            else:
                ret.append(p2[p2_ptr])
                p2_ptr += 1
        # One list may still have doc ids left over.
        ret.extend(p1[p1_ptr:])
        ret.extend(p2[p2_ptr:])
        return inverted_index_item(len(ret), ret)
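
    # Example: the union of [0, 2] and [1, 2] walks both sorted lists
    # and yields [0, 1, 2]; the intersection of the same lists is [2].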

def inverted_index_create(corpus: list[list[str]]) -> dict[str, inverted_index_item]:
    # Collect (term, doc_id) pairs and sort them so each posting list
    # is built in ascending doc-id order.
    pairs = []
    for i, doc in enumerate(corpus):
        for term in doc:
            pairs.append((term, i))
    pairs.sort()

    inverted_index: dict[str, inverted_index_item] = {}
    for term, doc_id in pairs:
        item = inverted_index.get(term, inverted_index_item())
        # Duplicates are adjacent after sorting, so checking the last
        # entry is enough to keep each doc id only once.
        if not item.posting_list or item.posting_list[-1] != doc_id:
            item.posting_list.append(doc_id)
            item.doc_freq += 1
        inverted_index[term] = item

    return inverted_index
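
# Example: for the toy corpus [["fair", "sun"], ["sun"]], the index maps
# "fair" -> 1->[0] and "sun" -> 2->[0, 1].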

def corpus_read() -> list[str]:
    # Read doc1.txt .. doc3.txt and lowercase them up front so
    # tokenization and indexing are case-insensitive.
    corpus = []
    for i in range(1, 4):
        with open(f"./doc{i}.txt", "r") as file:
            corpus.append(file.read().lower())
    return corpus

def tokenize(corpus: list[str]) -> list[list[str]]:
    # Split each document into a list of word tokens.
    ret: list[list[str]] = []
    for doc in corpus:
        ret.append(word_tokenize(doc))
    return ret

def kgram_create(term: str, k: int) -> list[str]:
    # All length-k substrings of term: zip k staggered slices, so for
    # k = 3 the slices term[0:], term[1:] and term[2:] line up.
    return ["".join(j) for j in zip(*[term[i:] for i in range(k)])]

def kgram_num(term: str, k: int) -> int:
    # A string of length n has n + 1 - k k-grams (with multiplicity).
    return len(term) + 1 - k

def jaccard_coefficient(query: str, term: str, k: int) -> float:
    # Jaccard coefficient of the two k-gram sets: |A & B| / |A | B|.
    # Sets (not lists) so duplicate k-grams are not double-counted.
    q_grams = set(kgram_create(query, k))
    t_grams = set(kgram_create(term, k))
    if not q_grams or not t_grams:
        return 0.0
    ins = len(q_grams & t_grams)
    return ins / (len(q_grams) + len(t_grams) - ins)
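
# Example: for k = 3, "illage" has grams {ill, lla, lag, age} and
# "village" has {vil, ill, lla, lag, age}: 4 shared out of 5 total,
# so the coefficient is 4/5 = 0.8.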

def edit_distance(a: str, b: str) -> int:
    # Levenshtein distance via dynamic programming.
    mat = [[0 for _ in range(len(b) + 1)] for _ in range(len(a) + 1)]
    # Transforming a prefix into the empty string costs its length.
    for i in range(len(a) + 1):
        mat[i][0] = i
    for j in range(len(b) + 1):
        mat[0][j] = j

    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            mat[i][j] = min(
                    mat[i - 1][j - 1] + (0 if a[i - 1] == b[j - 1] else 1),  # substitution
                    mat[i - 1][j] + 1,  # deletion
                    mat[i][j - 1] + 1,  # insertion
                    )
    return mat[-1][-1]
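
# Example: edit_distance("illage", "village") == 1 (insert the leading 'v').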

def candidate_vocab_get(query: str, vocab: list[str], k: int) -> dict[str, float]:
    # Score every vocabulary term by its k-gram Jaccard overlap with
    # the query, sorted from most to least similar.
    candidate_dict: dict[str, float] = {}
    for term in vocab:
        candidate_dict[term] = jaccard_coefficient(query, term, k)
    return dict(sorted(candidate_dict.items(), key=lambda x: x[1], reverse=True))

def spell_check(query: str, vocab: list[str], k: int) -> dict[str, int]:
    # Shortlist by k-gram similarity, then re-rank by edit distance so
    # the closest spelling comes first.
    candidate_vocab = candidate_vocab_get(query, vocab, k)
    for term in candidate_vocab:
        candidate_vocab[term] = edit_distance(query, term)
    return dict(sorted(candidate_vocab.items(), key=lambda x: x[1]))
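
# Example: with a vocabulary containing "village", spell_check("illage",
# vocab, 3) should rank it at or near the top (edit distance 1).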

def boolean_eval(query: str, inverted_index: dict[str, inverted_index_item]):
    # Replace each term with its posting-list item; NOT is not supported
    # by the evaluator below, so only AND and OR are treated as operators.
    token_arr = query.split()
    for i, v in enumerate(token_arr):
        if v not in ["AND", "OR"]:
            # Unknown terms get an empty item so queries still evaluate.
            token_arr[i] = inverted_index.get(v, inverted_index_item())
    return infix(token_arr)


def infix(expr):
    # Evaluate a parenthesis-free boolean query left to right, e.g.
    # [item1, "AND", item2, "OR", item3]. Operands are
    # inverted_index_item objects; NOT is not handled here.
    operands = []
    operators = []
    for c in expr:
        if c in ["AND", "OR"]:
            operators.append(c)
        else:
            operands.append(c)
            # Apply the pending operator as soon as both operands exist,
            # which keeps evaluation strictly left to right.
            if len(operands) >= 2 and operators:
                num2 = operands.pop()
                num1 = operands.pop()
                op = operators.pop()
                if op == "AND":
                    operands.append(num1.intersection(num2))
                else:
                    operands.append(num1.union(num2))
    return operands[0] if operands else inverted_index_item()
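
# Example (hypothetical posting lists):
#   a = inverted_index_item(2, [0, 2]); b = inverted_index_item(2, [1, 2])
#   infix([a, "AND", b]).posting_list -> [2]
#   infix([a, "OR", b]).posting_list  -> [0, 1, 2]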

def main():
    corpus = corpus_read()
    corpus = tokenize(corpus)  # a. tokenize doc1-doc3
    inverted_index = inverted_index_create(corpus)  # b. build the index
    vocab = list(inverted_index.keys())
    for k, v in inverted_index.items():
        print(k, v)
    # c. query "beauty AND thine", both directly and via the evaluator
    print(inverted_index["beauty"].intersection(inverted_index["thine"]).posting_list)
    print(boolean_eval("beauty AND thine", inverted_index).posting_list)
    # d. 3-gram spell correction for the misspelled word "illage"
    suggestions = spell_check("illage", vocab, 3)
    print("suggestion for 'illage':", next(iter(suggestions)))
    return


if __name__ == "__main__":
    main()