Commit 13ae1c62 by Yolanda Nainggolan

adding preprocessing page and editing dataframe page

parent 804be444
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Kelompok 3 | Search Engine with Inverted Index Simulator Based on Billboard Songs Collection\n",
" - 12S16003 Maria H. Siallagan\n",
" - 12S16026 Yolanda Nainggolan\n",
" - 12S16036 Prima Hutapea\n",
" - 12S16049 Rosa Delima Mendrofa"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import string\n",
"import re\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import xml.dom.minidom as minidom\n",
"\n",
"# Parse the Billboard-songs dataset. The XML file must sit next to this\n",
"# notebook; the previously committed run failed with FileNotFoundError,\n",
"# so that stale error output has been cleared.\n",
"dcmnt_xml = minidom.parse(\"dataset_STBI.xml\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Pull the element lists out of the parsed DOM; list indices line up across\n",
"# the tags of the same <DOC> entry.\n",
"all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')\n",
"all_profile = dcmnt_xml.getElementsByTagName('SONG')\n",
"all_date = dcmnt_xml.getElementsByTagName('ARTIST')\n",
"all_text = dcmnt_xml.getElementsByTagName('LYRICS')\n",
"all_pub = dcmnt_xml.getElementsByTagName('PUB')\n",
"all_page = dcmnt_xml.getElementsByTagName('PAGE')\n",
"\n",
"# Number of documents in the collection (one DOCNO per <DOC>).\n",
"N_DOC_sample = len(all_doc_no)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# One raw lyrics string per document, prefixed with a space as in the original.\n",
"all_sentence_doc_sample = [' ' + all_text[idx].firstChild.data for idx in range(N_DOC_sample)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preprocessing "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Per-document token lists; filled by the tokenization cell below.\n",
"tokens_doc = []"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def remove_punc_tokenize(sentence):\n",
"    \"\"\"Strip URLs and punctuation from `sentence`, then tokenize it.\n",
"\n",
"    Returns a list of word tokens produced by scikit-learn's default\n",
"    tokenizer (words of two or more word characters).\n",
"    \"\"\"\n",
"    # Fix: remove URLs BEFORE replacing punctuation. The original ran the\n",
"    # URL regex afterwards, when ':' and '/' were already blanked out, so\n",
"    # the pattern could never match.\n",
"    sentence = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', '', sentence, flags=re.MULTILINE)\n",
"    for punctuation in string.punctuation:\n",
"        sentence = sentence.replace(punctuation, \" \")\n",
"    tokens = []\n",
"    for w in CountVectorizer().build_tokenizer()(sentence):\n",
"        tokens.append(w)\n",
"    return tokens"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize every document.\n",
"# Fix: loop bound is N_DOC_sample; N_DOC was never defined (NameError on Run-All).\n",
"for i in range(N_DOC_sample):\n",
"    tokens_doc.append(remove_punc_tokenize(all_sentence_doc_sample[i]))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"stop_words = set(stopwords.words('english'))\n",
"def stop_word_token(tokens):\n",
"    \"\"\"Drop English stopwords from a token list.\"\"\"\n",
"    return [w for w in tokens if w not in stop_words]\n",
"\n",
"# Fix: loop bound is N_DOC_sample; N_DOC was never defined.\n",
"for i in range(N_DOC_sample):\n",
"    tokens_doc[i] = stop_word_token(tokens_doc[i])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Drop any token that contains a digit.\n",
"# Fix: loop bound is N_DOC_sample; N_DOC was never defined.\n",
"for i in range(N_DOC_sample):\n",
"    tokens_doc[i] = [w for w in tokens_doc[i] if not any(ch.isdigit() for ch in w)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import PorterStemmer\n",
"stemmer = PorterStemmer()\n",
"def stemming(tokens):\n",
"    \"\"\"Stem each token in place with the Porter stemmer; returns the list.\"\"\"\n",
"    # The original compared token != stem before assigning; assigning\n",
"    # unconditionally is equivalent and simpler.\n",
"    for i in range(len(tokens)):\n",
"        tokens[i] = stemmer.stem(tokens[i])\n",
"    return tokens\n",
"\n",
"# Fix: loop bound is N_DOC_sample; N_DOC was never defined.\n",
"for i in range(N_DOC_sample):\n",
"    tokens_doc[i] = stemming(tokens_doc[i])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect every token across the corpus.\n",
"# Fix: loop bound is N_DOC_sample; N_DOC was never defined.\n",
"all_tokens = []\n",
"for i in range(N_DOC_sample):\n",
"    for w in tokens_doc[i]:\n",
"        all_tokens.append(w)\n",
"\n",
"new_sentence = ' '.join(all_tokens)\n",
"\n",
"# NOTE(review): re-tokenizing the joined string only re-appends tokens that\n",
"# are already present; it is harmless because the next cell converts\n",
"# all_tokens to a set, but it could be removed entirely.\n",
"for w in CountVectorizer().build_tokenizer()(new_sentence):\n",
"    all_tokens.append(w)"
]
},
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Deduplicate into the vocabulary.\n",
"all_tokens = set(all_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from itertools import count\n",
"try:\n",
"    from itertools import izip as zip  # Python 2 compatibility; no-op on Python 3\n",
"except ImportError:\n",
"    pass\n",
"\n",
"# Build a positional (proximity) inverted index:\n",
"#   token -> {DOCNO: [1-based positions of the token in that document]}\n",
"# Fix: inner loop bound is N_DOC_sample; N_DOC was never defined.\n",
"proximity_index = {}\n",
"for token in all_tokens:\n",
"    dict_doc_position = {}\n",
"    for n in range(N_DOC_sample):\n",
"        if token in tokens_doc[n]:\n",
"            dict_doc_position[all_doc_no[n].firstChild.data] = [i + 1 for i, j in zip(count(), tokens_doc[n]) if j == token]\n",
"    proximity_index[token] = dict_doc_position"
]
},
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"# Alphabetical order by token makes the printed index readable.\n",
"proximity_index = collections.OrderedDict(sorted(proximity_index.items()))\n",
"for key, value in proximity_index.items():\n",
"    print(key, value)"
]
},
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -23,7 +23,7 @@
<SONG> i cant get no satisfaction </SONG>
<ARTIST> the rolling stones </ARTIST>
<YEAR> 1965 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -1598,7 +1598,7 @@
<SONG> love is a hurtin thing </SONG>
<ARTIST> lou rawls </ARTIST>
<YEAR> 1966 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -1616,7 +1616,7 @@
<SONG> gloria </SONG>
<ARTIST> shadows of knight </ARTIST>
<YEAR> 1966 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -1778,7 +1778,7 @@
<SONG> zorba the greek </SONG>
<ARTIST> herb alpert and the tijuana brass </ARTIST>
<YEAR> 1966 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -1940,7 +1940,7 @@
<SONG> kind of a drag </SONG>
<ARTIST> the buckinghams </ARTIST>
<YEAR> 1967 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -2399,7 +2399,7 @@
<SONG> soul finger </SONG>
<ARTIST> the barkays </ARTIST>
<YEAR> 1967 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -3272,7 +3272,7 @@
<SONG> stay in my corner </SONG>
<ARTIST> the dells </ARTIST>
<YEAR> 1968 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -4217,7 +4217,7 @@
<SONG> twentyfive miles </SONG>
<ARTIST> edwin starr </ARTIST>
<YEAR> 1969 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -4325,7 +4325,7 @@
<SONG> tracy </SONG>
<ARTIST> the cuff links </ARTIST>
<YEAR> 1969 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -4415,7 +4415,7 @@
<SONG> your good thing is about to end </SONG>
<ARTIST> lou rawls </ARTIST>
<YEAR> 1969 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -4559,7 +4559,7 @@
<SONG> ill be there </SONG>
<ARTIST> the jackson 5 </ARTIST>
<YEAR> 1970 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -5234,7 +5234,7 @@
<SONG> the bells </SONG>
<ARTIST> the originals </ARTIST>
<YEAR> 1970 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -6233,7 +6233,7 @@
<SONG> dont knock my love </SONG>
<ARTIST> wilson pickett </ARTIST>
<YEAR> 1971 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -6368,7 +6368,7 @@
<SONG> baby dont get hooked on me </SONG>
<ARTIST> mac davis </ARTIST>
<YEAR> 1972 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -6755,7 +6755,7 @@
<SONG> jungle fever </SONG>
<ARTIST> the chakachas </ARTIST>
<YEAR> 1972 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -6935,7 +6935,7 @@
<SONG> joy </SONG>
<ARTIST> apollo 100 </ARTIST>
<YEAR> 1972 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -7511,7 +7511,7 @@
<SONG> keep on truckin </SONG>
<ARTIST> eddie kendricks </ARTIST>
<YEAR> 1973 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -7574,7 +7574,7 @@
<SONG> give me love give me peace on earth </SONG>
<ARTIST> george harrison </ARTIST>
<YEAR> 1973 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -8645,7 +8645,7 @@
<SONG> hang on in there baby </SONG>
<ARTIST> johnny bristol </ARTIST>
<YEAR> 1974 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -10157,7 +10157,7 @@
<SONG> theme from swat </SONG>
<ARTIST> rhythm heritage </ARTIST>
<YEAR> 1976 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -10283,7 +10283,7 @@
<SONG> theme from mahogany do you know where youre going to </SONG>
<ARTIST> diana ross </ARTIST>
<YEAR> 1976 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -11822,7 +11822,7 @@
<SONG> emotion </SONG>
<ARTIST> samantha sang </ARTIST>
<YEAR> 1978 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -11885,7 +11885,7 @@
<SONG> feels so good </SONG>
<ARTIST> chuck mangione </ARTIST>
<YEAR> 1978 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -12344,7 +12344,7 @@
<SONG> because the night </SONG>
<ARTIST> patti smith group </ARTIST>
<YEAR> 1978 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -12452,7 +12452,7 @@
<SONG> fool if you think its over </SONG>
<ARTIST> chris rea </ARTIST>
<YEAR> 1978 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -12965,7 +12965,7 @@
<SONG> lotta love </SONG>
<ARTIST> nicolette larson </ARTIST>
<YEAR> 1979 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -13316,7 +13316,7 @@
<SONG> rise </SONG>
<ARTIST> herb alpert </ARTIST>
<YEAR> 1979 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -13424,7 +13424,7 @@
<SONG> bad case of loving you doctor doctor </SONG>
<ARTIST> robert palmer </ARTIST>
<YEAR> 1979 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -13982,7 +13982,7 @@
<SONG> rise </SONG>
<ARTIST> herb alpert </ARTIST>
<YEAR> 1980 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -14504,7 +14504,7 @@
<SONG> morning train nine to five </SONG>
<ARTIST> sheena easton </ARTIST>
<YEAR> 1981 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -14666,7 +14666,7 @@
<SONG> the best of times </SONG>
<ARTIST> styx </ARTIST>
<YEAR> 1981 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -14801,7 +14801,7 @@
<SONG> how bout us </SONG>
<ARTIST> champaign </ARTIST>
<YEAR> 1981 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -15872,7 +15872,7 @@
<SONG> you could have been with me </SONG>
<ARTIST> sheena easton </ARTIST>
<YEAR> 1982 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -16025,7 +16025,7 @@
<SONG> i keep forgettin every time youre near </SONG>
<ARTIST> michael mcdonald </ARTIST>
<YEAR> 1982 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -17510,7 +17510,7 @@
<SONG> the warrior </SONG>
<ARTIST> scandal </ARTIST>
<YEAR> 1984 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -19976,7 +19976,7 @@
<SONG> at this moment </SONG>
<ARTIST> billy vera and the beaters </ARTIST>
<YEAR> 1987 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -20291,7 +20291,7 @@
<SONG> songbird </SONG>
<ARTIST> kenny g </ARTIST>
<YEAR> 1987 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -21164,7 +21164,7 @@
<SONG> whats on your mind pure energy </SONG>
<ARTIST> information society </ARTIST>
<YEAR> 1988 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -21308,7 +21308,7 @@
<SONG> rocket 2 u </SONG>
<ARTIST> the jets </ARTIST>
<YEAR> 1988 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -22820,7 +22820,7 @@
<SONG> i dont have the heart </SONG>
<ARTIST> james ingram </ARTIST>
<YEAR> 1990 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -23252,7 +23252,7 @@
<SONG> oh girl </SONG>
<ARTIST> paul young </ARTIST>
<YEAR> 1990 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -25853,7 +25853,7 @@
<SONG> forever in love </SONG>
<ARTIST> kenny g </ARTIST>
<YEAR> 1993 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -27374,7 +27374,7 @@
<SONG> ill be there for youyoure all i need to get by </SONG>
<ARTIST> method man featuring mary j blige </ARTIST>
<YEAR> 1995 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -27851,7 +27851,7 @@
<SONG> ill stand by you </SONG>
<ARTIST> the pretenders </ARTIST>
<YEAR> 1995 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -31685,7 +31685,7 @@
<SONG> bye bye bye </SONG>
<ARTIST> n sync </ARTIST>
<YEAR> 2000 </YEAR>
<LYRICS> </LYRICS>
<LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE>
</DOC>
<DOC>
......@@ -3,16 +3,14 @@ resource_package = __name__
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count
import collections
import math
import xml.etree.ElementTree as et
import xml.etree.ElementTree as et
from xml.etree.ElementTree import ElementTree
##############Remove Punctuation, URL and Tokenize###################
......@@ -50,20 +48,37 @@ def generate_ngrams(data, n):
return ngram, result
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stop_word_token(tokens):
tokens = [w for w in tokens if not w in stop_words]
return tokens
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stemming(tokens):
for i in range(0, len(tokens)):
if (tokens[i] != stemmer.stem(tokens[i])):
tokens[i] = stemmer.stem(tokens[i])
return tokens
def main(query):
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_headline = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_headline.append(node.text)
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
......@@ -72,7 +87,7 @@ def main(query):
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_headline[i] + all_text[i])
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))
......@@ -189,7 +204,7 @@ def main(query):
score*=idf[i] #tf * idf
idx = all_doc_no[i]
judul = all_headline[i]
judul = all_song[i]
dic['docno'] = idx
dic['judul'] = judul
......@@ -209,16 +224,16 @@ def detail(nomor):
tree.parse("apps/data/dataset_STBI.xml")
all_doc_no = []
all_headline = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
# all_headline.append(node.text.replace("\n"," "))
all_headline.append(node.text)
head = all_headline
# all_song.append(node.text.replace("\n"," "))
all_song.append(node.text)
head = all_song
for node in tree.iter("LYRICS"):
# all_text.append(node.text.replace("\n"," "))
......@@ -233,5 +248,5 @@ def detail(nomor):
check = all_doc_no[i]
if check == id:
text = all_text[i]
judul = all_headline[i]
judul = all_song[i]
return text,judul
\ No newline at end of file
......@@ -55,6 +55,15 @@ footer {
border-radius: 15px;
padding: 20px;
margin-top: 10px;
width: auto;
}
.carda {
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2);
border-radius: 15px;
padding: 20px;
margin-top: 10px;
width: max-content;
}
.jumbotron {
......
......@@ -5,35 +5,103 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
white-space: nowrap;
}
#middleboxa{
float:left;
white-space: nowrap;
}
#rightbox{
float:right;
white-space: nowrap;
}
</style>
</head>
<body>
<main>
<div id="content">
<article class="card">
<div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
<center><h1>Dataset</h1><br>
<table style="width:100%">
<tr>
<th>DOCNO</th>
<th>SONG</th>
<th>ARTIST</th>
<th>LYRICS</th>
</tr>
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><h1>Dataset</h1><br></center>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
{% for l in LYRICS %}
<tr>
<td>{{ i }}</td>
<td>{{ j }}</td>
<td>{{ k }}</td>
<td>{{ l }}</td>
</tr>
{% endfor %}
<div id = "leftbox">
<table>
<tr>
<th>DOCNO</th>
</tr>
{% for i in DOCNO %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table align="left">
<tr>
<th>SONG</th>
</tr>
{% for i in SONG %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>ARTIST</th>
</tr>
{% for i in ARTIST %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>LYRICS</th>
</tr>
{% for i in LYRICS %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
</table>
</center>
</article>
</article>
</div>
......@@ -46,9 +114,13 @@
</body>
<script>
function pageRedirect() {
window.location.href = "/preprocessing";
}
function pageRedirect_prev() {
window.location.href = "/home";
}
function pageRedirect_next() {
window.location.href = "/preprocessing";
}
</script>
</html>
......@@ -9,7 +9,12 @@
<body>
<main>
<div id="content">
<article class="card">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
</div>
<div class="row">
<center><h1 style="font-size:45px">Searching!<br></h1>
<p style="font-size:20px"><strong>Silahkan masukkan lirik dari lagu yang ingin Anda temukan</strong></p>
......@@ -27,10 +32,14 @@
</div>
</main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/indexing";
}
</script>
</html>
......@@ -11,25 +11,44 @@
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button>
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><h1>Indexing</h1><br></center>
<p><strong>Dengan Proximity Index</strong></p><br></center>
<table style="width:100%">
<tr>
<th>Apa judulnya ya?</th>
</tr>
{% for i in indexnya %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body>
<script>
function pageRedirect() {
window.location.href = "/index";
}
function pageRedirect_prev() {
window.location.href = "/preprocessing4";
}
function pageRedirect_next() {
window.location.href = "/index";
}
</script>
</html>
......@@ -11,25 +11,45 @@
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button>
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
<center><h1>Text Preprocessing</h1><br></center>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 1</strong></p>
<p><strong>After Punctuation Removal and Tokenization</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body>
<script>
function pageRedirect() {
window.location.href = "/indexing";
}
function pageRedirect_prev() {
window.location.href = "/dataframe";
}
function pageRedirect_next() {
window.location.href = "/preprocessing2";
}
</script>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 2</strong></p>
<p><strong>After Case Folding</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/preprocessing";
}
function pageRedirect_next() {
window.location.href = "/preprocessing3";
}
</script>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 3</strong></p>
<p><strong>After Stopwords Removal</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/preprocessing2";
}
function pageRedirect_next() {
window.location.href = "/preprocessing4";
}
</script>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 4</strong></p>
<p><strong>After Normalization</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/preprocessing3";
}
function pageRedirect_next() {
window.location.href = "/indexing";
}
</script>
</html>
......@@ -10,6 +10,9 @@ urlpatterns = [
path('', views.home),
path('dataframe/', views.dataframe),
path('preprocessing/', views.preprocessing),
path('preprocessing2/', views.preprocessing2),
path('preprocessing3/', views.preprocessing3),
path('preprocessing4/', views.preprocessing4),
path('indexing/', views.indexing),
path('index/', views.index),
path('result/', views.result),
......
......@@ -39,10 +39,246 @@ def dataframe(request):
return render(request, 'apps/dataframe.html', context)
def preprocessing(request):
return render(request, 'apps/preprocessing.html')
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
context = {"tokens_doc": tokens_doc}
return render(request, 'apps/preprocessing.html', context)
def preprocessing2(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i])
context = {"tokens_doc": tokens_doc}
return render(request, 'apps/preprocessing2.html', context)
def preprocessing3(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = main.stop_word_token(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
context = {"tokens_doc": tokens_doc}
return render(request, 'apps/preprocessing3.html', context)
def preprocessing4(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = main.stop_word_token(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i])
context = {"tokens_doc": tokens_doc}
return render(request, 'apps/preprocessing4.html', context)
def indexing(request):
return render(request, 'apps/indexing.html')
from sklearn.feature_extraction.text import CountVectorizer
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = main.stop_word_token(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i])
all_tokens =[]
for i in range(N_DOC):
for j in tokens_doc[i]:
all_tokens.append(j)
new_sentences = ' '.join([w for w in all_tokens])
for j in CountVectorizer().build_tokenizer()(new_sentences):
all_tokens.append(j)
all_tokens = set(all_tokens)
from itertools import count
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
proximity_index = {}
for token in all_tokens:
dict_doc_position = {}
for n in range(N_DOC):
if(token in tokens_doc[n]):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position
import collections
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
for key, value in proximity_index.items():
indexnya = (key, value)
context = {"indexnya": indexnya}
return render(request, 'apps/indexing.html', context)
def index(request):
return render(request, 'apps/index.html')
......
......@@ -22,6 +22,9 @@ urlpatterns = [
path('', views.home),
path('dataframe/', views.dataframe),
path('preprocessing/', views.preprocessing),
path('preprocessing2/', views.preprocessing2),
path('preprocessing3/', views.preprocessing3),
path('preprocessing4/', views.preprocessing4),
path('indexing/', views.indexing),
path('index/', views.index),
path('result/', views.result),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment