Lab Evaluation 1
Portions
PRACTICE SESSION: students are to upload their work individually to AUMS on or before Sept 26, 2023, by 9 am. Consider a few Word documents and do the following (using any language of your choice):
- Stemming (Porter's algorithm) (see the sketch after this list)
- Inverted index construction
- Spell correction using weighted edit distances / k-grams
- Boolean model query processing (with or without skip pointers)
Lab evaluation 1: September 29. Portions: algorithms used in Chapters 1-3.
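Porter stemming from the practice list is not exercised by the script that follows; here is a minimal sketch, assuming NLTK (already a dependency of the script below) is installed:

# Minimal Porter stemming sketch (assumes nltk is installed; standalone,
# not called by the main script below).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ["beauty", "beauties", "loving", "loved"]:
    print(word, "->", stemmer.stem(word))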
from nltk.tokenize import word_tokenize
from dataclasses import dataclass, field
# Consider the given documents, which are a collection of Shakespeare's sonnets.
# Perform the following operations with respect to the documents:
# a. Tokenize the contents of doc1, doc2 and doc3.
# b. Construct the inverted index.
# c. Retrieve the result for the query "beauty AND thine" using the inverted index.
# d. If the query had the misspelled word "illage", use 3-gram spell correction
#    and suggest the correct word for the misspelled word.
@dataclass
class inverted_index_item():
    doc_freq: int = 0
    posting_list: list[int] = field(default_factory=list)

    def __repr__(self) -> str:
        return f"{self.doc_freq}->{self.posting_list}"

    def intersection(self, other):
        # Set-based intersection; sort so the posting list stays ordered,
        # and record the resulting document frequency.
        common = sorted(set(self.posting_list) & set(other.posting_list))
        return inverted_index_item(len(common), common)
    def union(self, other):
        # Merge two sorted posting lists, keeping each doc id once.
        # (The earlier version never advanced the pointers on a match,
        # which looped forever; this is the standard merge.)
        ret = []
        p1 = self.posting_list
        p2 = other.posting_list
        p1_ptr = 0
        p2_ptr = 0
        while p1_ptr < len(p1) and p2_ptr < len(p2):
            if p1[p1_ptr] == p2[p2_ptr]:
                ret.append(p1[p1_ptr])
                p1_ptr += 1
                p2_ptr += 1
            elif p1[p1_ptr] < p2[p2_ptr]:
                ret.append(p1[p1_ptr])
                p1_ptr += 1
            else:
                ret.append(p2[p2_ptr])
                p2_ptr += 1
        # Append whatever remains of either list.
        ret.extend(p1[p1_ptr:])
        ret.extend(p2[p2_ptr:])
        return inverted_index_item(len(ret), ret)
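
# Hedged sketch of posting-list intersection *with* skip pointers, the other
# variant named in the practice list (not called by main below). Assumes both
# lists are sorted; skips of length ~sqrt(len) are the usual heuristic.
import math

def intersect_with_skips(p1: list[int], p2: list[int]) -> list[int]:
    skip1 = max(1, int(math.sqrt(len(p1))))
    skip2 = max(1, int(math.sqrt(len(p2))))
    i, j, ret = 0, 0, []
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            ret.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            # Follow the skip pointer only while it does not overshoot p2[j].
            if i % skip1 == 0 and i + skip1 < len(p1) and p1[i + skip1] <= p2[j]:
                i += skip1
            else:
                i += 1
        else:
            if j % skip2 == 0 and j + skip2 < len(p2) and p2[j + skip2] <= p1[i]:
                j += skip2
            else:
                j += 1
    return ret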
def inverted_index_create(corpus: list[list[str]]) -> dict[str, inverted_index_item]:
    # Collect (term, doc_id) pairs, sort them, then build the postings.
    pairs = []
    for i, doc in enumerate(corpus):
        for term in doc:
            pairs.append((term, i))
    pairs.sort()
    inverted_index: dict[str, inverted_index_item] = {}
    for term, doc_id in pairs:
        item = inverted_index.get(term, inverted_index_item())
        # Append the doc id (and bump the document frequency) only on the
        # first occurrence of the term in this document.
        if not item.posting_list or item.posting_list[-1] != doc_id:
            item.posting_list.append(doc_id)
            item.doc_freq += 1
        inverted_index[term] = item
    return inverted_index
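# For example, if "beauty" occurred in all three documents, its entry would
# print as "beauty 3->[0, 1, 2]" (doc_freq -> posting_list, 0-based doc ids).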
def corpus_read() -> list[str]:
    # Read doc1.txt .. doc3.txt and lowercase their contents.
    corpus = []
    for i in range(1, 4):
        with open(f"./doc{i}.txt", "r") as file:
            corpus.append(file.read().lower())
    return corpus

def tokenize(corpus: list[str]) -> list[list[str]]:
    # Tokenize each document with NLTK's word tokenizer.
    return [word_tokenize(doc) for doc in corpus]
def kgram_create(term: str, k: int) -> list[str]:
    # All k-length substrings of term: zipping k shifted copies of the
    # string yields one tuple per k-gram.
    return ["".join(j) for j in zip(*[term[i:] for i in range(k)])]

def kgram_num(term: str, k: int) -> int:
    # A string of length n has n - k + 1 k-grams.
    return len(term) + 1 - k

def jaccard_coefficient(query: str, term: str, k: int) -> float:
    # Jaccard coefficient |A ∩ B| / |A ∪ B| over the two k-gram sets
    # (sets, so repeated k-grams are counted once).
    q_grams = set(kgram_create(query, k))
    t_grams = set(kgram_create(term, k))
    if not q_grams or not t_grams:
        return 0.0
    ins = len(q_grams & t_grams)
    return ins / (len(q_grams) + len(t_grams) - ins)
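# Worked example with k = 3: "illage" has grams {ill, lla, lag, age} and
# "village" has {vil, ill, lla, lag, age}; the overlap is 4 and the union
# is 5, so the coefficient is 4/5 = 0.8.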
def edit_distance(a: str, b: str) -> int:
    # Levenshtein distance via dynamic programming: mat[i][j] holds the
    # distance between the first i chars of a and the first j chars of b.
    mat = [[0 for _ in range(len(b) + 1)] for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        mat[i][0] = i
    for j in range(len(b) + 1):
        mat[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            mat[i][j] = min(
                mat[i-1][j-1] + (0 if a[i-1] == b[j-1] else 1),  # substitute
                mat[i-1][j] + 1,  # delete from a
                mat[i][j-1] + 1,  # insert into a
            )
    return mat[-1][-1]
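# e.g. edit_distance("illage", "village") == 1 (a single insertion of "v").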
def candidate_vocab_get(query: str, vocab: list[str], k: int) -> dict[str, float]:
    # Score every vocabulary term by k-gram Jaccard overlap with the query,
    # highest overlap first.
    candidate_dict = {term: jaccard_coefficient(query, term, k) for term in vocab}
    return dict(sorted(candidate_dict.items(), key=lambda x: x[1], reverse=True))

def spell_check(query: str, vocab: list[str], k: int) -> dict[str, int]:
    # Re-rank the k-gram candidates by edit distance, smallest first; the
    # first key is the suggested correction.
    candidate_vocab = candidate_vocab_get(query, vocab, k)
    scored = {term: edit_distance(query, term) for term in candidate_vocab}
    return dict(sorted(scored.items(), key=lambda x: x[1]))
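# spell_check("illage", vocab, 3) should rank "village" first whenever that
# word appears in the corpus vocabulary.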
def boolean_eval(query: str, inverted_index: dict[str, inverted_index_item]):
    # Replace each non-operator token with its posting item (an empty item
    # if the term is absent, so the operators still work), then evaluate.
    token_arr = query.split()
    for i, v in enumerate(token_arr):
        if v not in ["AND", "OR", "NOT"]:
            token_arr[i] = inverted_index.get(v, inverted_index_item())
    return infix(token_arr)
def infix(expr):
    # Evaluate a flat or fully parenthesised Boolean expression whose
    # operands are inverted_index_item objects and whose operators are
    # "AND" / "OR" (NOT is not handled here).
    expr = ["("] + expr + [")"]
    stack = []
    for c in expr:
        if c != ")":
            stack.append(c)
            continue
        # On ")", reduce the group on top of the stack to a single item.
        num2 = stack.pop()
        while stack[-1] != "(":
            op = stack.pop()
            num1 = stack.pop()
            if op == "AND":
                num2 = num1.intersection(num2)
            elif op == "OR":
                num2 = num1.union(num2)
        stack.pop()  # discard the matching "("
        stack.append(num2)
    return stack.pop()
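# e.g. infix([item_a, "AND", item_b]) (item_a/item_b being posting items)
# returns their intersection, matching query (c) above.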
def main():
    # (a) read and tokenize the documents
    corpus = tokenize(corpus_read())
    # (b) build and display the inverted index
    inverted_index = inverted_index_create(corpus)
    vocab = list(inverted_index.keys())
    for k, v in inverted_index.items():
        print(k, v)
    # (c) "beauty AND thine", both directly and via the Boolean evaluator
    print(inverted_index["beauty"].intersection(inverted_index["thine"]).posting_list)
    print(boolean_eval("beauty AND thine", inverted_index).posting_list)
    # (d) 3-gram spell correction for the misspelled word "illage"
    print(spell_check("illage", vocab, 3))

if __name__ == "__main__":
    main()