-
Notifications
You must be signed in to change notification settings - Fork 0
/
LSH.html
89 lines (85 loc) · 7.88 KB
/
LSH.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html><head><title>Python: module IR_A2</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head><body bgcolor="#f0f0f8">
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="heading">
<tr bgcolor="#7799ee">
<td valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"> <br><big><big><strong>IR_A2</strong></big></big></font></td
><td align=right valign=bottom
><font color="#ffffff" face="helvetica, arial"><a href=".">index</a><br><a href="file:c%3A%5Cusers%5Caditya%20agarwal%5C.spyder-py3%5Cir%5Cir_a2.py">c:\users\aditya agarwal\.spyder-py3\ir\ir_a2.py</a></font></td></tr></table>
<p><tt>Created on Sun Oct 25 13:21:05 2020<br>
<br>
@author: Aditya Agarwal</tt></p>
<p>
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="section">
<tr bgcolor="#aa55cc">
<td colspan=3 valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"><big><strong>Modules</strong></big></font></td></tr>
<tr><td bgcolor="#aa55cc"><tt> </tt></td><td> </td>
<td width="100%"><table width="100%" summary="list"><tr><td width="25%" valign=top><a href="PIL.Image.html">PIL.Image</a><br>
<a href="PIL.ImageTk.html">PIL.ImageTk</a><br>
</td><td width="25%" valign=top><a href="numpy.html">numpy</a><br>
<a href="os.html">os</a><br>
</td><td width="25%" valign=top><a href="pandas.html">pandas</a><br>
<a href="time.html">time</a><br>
</td><td width="25%" valign=top><a href="tkinter.html">tkinter</a><br>
</td></tr></table></td></tr></table><p>
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="section">
<tr bgcolor="#eeaa77">
<td colspan=3 valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"><big><strong>Functions</strong></big></font></td></tr>
<tr><td bgcolor="#eeaa77"><tt> </tt></td><td> </td>
<td width="100%"><dl><dt><a name="-MinHashingNumpy"><strong>MinHashingNumpy</strong></a>(df_shingles, num_hashes=100)</dt><dd><tt>This function generates random permutations of indices and then<br>
for each permutation it generates the signatures. Numpy is used<br>
to accelerate the process. The first occurrence of a '1' for a<br>
shingle is used to generate the signature for a doc.</tt></dd></dl>
<dl><dt><a name="-comp"><strong>comp</strong></a>(item)</dt><dd><tt>helper function to compare similarity score for sorting docs</tt></dd></dl>
<dl><dt><a name="-doLSH"><strong>doLSH</strong></a>(df_signature, rows_band=5)</dt><dd><tt>Actual LSH starts here using the signature matrix. Documents in the same band that are similar <br>
have a higher probability of landing in the same bucket. Returns the buckets list which contains<br>
buckets for documents.</tt></dd></dl>
<dl><dt><a name="-doShingling"><strong>doShingling</strong></a>(direc, length_shingle=5)</dt><dd><tt>Performs shingling and builds incidence matrix for shingles</tt></dd></dl>
<dl><dt><a name="-getDir"><strong>getDir</strong></a>(direc)</dt><dd><tt>Get directory input and start processing the documents.</tt></dd></dl>
<dl><dt><a name="-getQuery"><strong>getQuery</strong></a>(query)</dt><dd><tt>Get query sequence and process the query.</tt></dd></dl>
<dl><dt><a name="-getSimDocs"><strong>getSimDocs</strong></a>(cur_doc, buckets_dict, df_signature, rows_band)</dt><dd><tt>Returns documents that are in the same bucket as the current document/query</tt></dd></dl>
<dl><dt><a name="-get_fileslist"><strong>get_fileslist</strong></a>(destination_folder)</dt><dd><tt>Returns list of files</tt></dd></dl>
<dl><dt><a name="-hashBand"><strong>hashBand</strong></a>(cur_band, buckets_list)</dt><dd><tt>Function to hash one band of the document given as input to one of the buckets<br>
in bucket list given as input.</tt></dd></dl>
<dl><dt><a name="-hashQueryBand"><strong>hashQueryBand</strong></a>(cur_band)</dt><dd><tt>helper function to hash one band of the document given as input</tt></dd></dl>
<dl><dt><a name="-matrix_create"><strong>matrix_create</strong></a>(lists, z=5)</dt><dd><tt>Generates incidence matrix for K-grams (shingles)</tt></dd></dl>
<dl><dt><a name="-setBands"><strong>setBands</strong></a>(bns)</dt><dd><tt>helper function to set number of bands</tt></dd></dl>
<dl><dt><a name="-setPerms"><strong>setPerms</strong></a>(nps)</dt><dd><tt>helper function to set number of permutations</tt></dd></dl>
<dl><dt><a name="-setShingles"><strong>setShingles</strong></a>(ss)</dt><dd><tt>helper function to set size of shingles</tt></dd></dl>
<dl><dt><a name="-similarity_Cosine"><strong>similarity_Cosine</strong></a>(query, simDocs, df_signature)</dt><dd><tt>This function finds cosine similarity between two documents</tt></dd></dl>
<dl><dt><a name="-similarity_J"><strong>similarity_J</strong></a>(query, simDocs, df_signature)</dt><dd><tt>This function finds jaccard similarity between two documents</tt></dd></dl>
</td></tr></table><p>
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="section">
<tr bgcolor="#55aa55">
<td colspan=3 valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"><big><strong>Data</strong></big></font></td></tr>
<tr><td bgcolor="#55aa55"><tt> </tt></td><td> </td>
<td width="100%"><strong>HEIGHT</strong> = 800<br>
<strong>WIDTH</strong> = 800<br>
<strong>background_label</strong> = &lt;tkinter.Label object .!label&gt;<br>
<strong>button</strong> = &lt;tkinter.Button object .!frame.!button&gt;<br>
<strong>button2</strong> = &lt;tkinter.Button object .!frame2.!button&gt;<br>
<strong>button3</strong> = &lt;tkinter.Button object .!frame3.!button&gt;<br>
<strong>button4</strong> = &lt;tkinter.Button object .!frame4.!button&gt;<br>
<strong>button5</strong> = &lt;tkinter.Button object .!frame5.!button&gt;<br>
<strong>canvas</strong> = &lt;tkinter.Canvas object .!canvas&gt;<br>
<strong>entry</strong> = &lt;tkinter.Entry object .!frame.!entry&gt;<br>
<strong>entry2</strong> = &lt;tkinter.Entry object .!frame2.!entry&gt;<br>
<strong>entry3</strong> = &lt;tkinter.Entry object .!frame3.!entry&gt;<br>
<strong>entry4</strong> = &lt;tkinter.Entry object .!frame4.!entry&gt;<br>
<strong>entry5</strong> = &lt;tkinter.Entry object .!frame5.!entry&gt;<br>
<strong>frame</strong> = &lt;tkinter.Frame object .!frame&gt;<br>
<strong>frame2</strong> = &lt;tkinter.Frame object .!frame2&gt;<br>
<strong>frame3</strong> = &lt;tkinter.Frame object .!frame3&gt;<br>
<strong>frame4</strong> = &lt;tkinter.Frame object .!frame4&gt;<br>
<strong>frame5</strong> = &lt;tkinter.Frame object .!frame5&gt;<br>
<strong>imagex</strong> = &lt;PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1400x787&gt;<br>
<strong>label</strong> = &lt;tkinter.Label object .!frame6.!label&gt;<br>
<strong>lower_frame</strong> = &lt;tkinter.Frame object .!frame6&gt;<br>
<strong>photo</strong> = &lt;PIL.ImageTk.PhotoImage object&gt;<br>
<strong>root</strong> = &lt;tkinter.Tk object .&gt;</td></tr></table>
</body></html>