{"id":316,"date":"2018-05-11T11:47:04","date_gmt":"2018-05-11T11:47:04","guid":{"rendered":"http:\/\/ai.intelligentonlinetools.com\/ml\/?page_id=316"},"modified":"2018-05-12T23:03:43","modified_gmt":"2018-05-12T23:03:43","slug":"text-classification-tweet-dataset-using-convolutional-neural-network","status":"publish","type":"page","link":"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/","title":{"rendered":"Text Classification for Tweet Dataset using Convolutional Neural Network"},"content":{"rendered":"<div class=\"bzryg69efa4b195a29\" ><script async src=\"\/\/pagead2.googlesyndication.com\/pagead\/js\/adsbygoogle.js\"><\/script>\n<!-- Text analytics techniques 728_90 horizontal top -->\n<ins class=\"adsbygoogle\"\n     style=\"display:inline-block;width:728px;height:90px\"\n     data-ad-client=\"ca-pub-3416618249440971\"\n     data-ad-slot=\"2926649501\"><\/ins>\n<script>\n(adsbygoogle = window.adsbygoogle || []).push({});\n<\/script><\/div><style type=\"text\/css\">\r\n.bzryg69efa4b195a29 {\r\nmargin: 5px; padding: 0px;\r\n}\r\n@media screen and (min-width: 1201px) {\r\n.bzryg69efa4b195a29 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (min-width: 993px) and (max-width: 1200px) {\r\n.bzryg69efa4b195a29 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (min-width: 769px) and (max-width: 992px) {\r\n.bzryg69efa4b195a29 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (min-width: 768px) and (max-width: 768px) {\r\n.bzryg69efa4b195a29 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (max-width: 767px) {\r\n.bzryg69efa4b195a29 {\r\ndisplay: block;\r\n}\r\n}\r\n<\/style>\r\n<p>This is source code for <a href=\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-cnn-convolutional-neural-network-python\/\" target=\"_blank\">Text Classification for Different Datasets<\/a> CNN based on the code from<br \/>\n<a href=\"https:\/\/richliao.github.io\/supervised\/classification\/2016\/11\/26\/textclassifier-convolutional\/\">Text Classification, Part I &#8211; Convolutional Networks<\/a>  <\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\n# -*- coding: utf-8 -*-\r\n\r\nimport csv\r\nimport re\r\nfrom nltk.corpus import stopwords\r\n\r\nstopword_set = set(stopwords.words(&quot;english&quot;)) \r\n\r\ndef preprocess(raw_text):\r\n\r\n    # keep only words\r\n    letters_only_text = re.sub(&quot;[^a-zA-Z]&quot;, &quot; &quot;, raw_text)\r\n\r\n    # convert to lower case and split \r\n    words = letters_only_text.lower().split()\r\n\r\n    # remove stopwords\r\n   \r\n    meaningful_words = [w for w in words if w not in stopword_set]\r\n\r\n    # join the cleaned words in a list\r\n    cleaned_word_list = &quot; &quot;.join(meaningful_words)\r\n\r\n    return cleaned_word_list\r\n\r\ndef preprocess2(raw_text):\r\n    stopword_set = set(stopwords.words(&quot;english&quot;))\r\n    return &quot; &quot;.join([i for i in re.sub(r'[^a-zA-Z\\s]', &quot;&quot;, raw_text).lower().split() if i not in stopword_set])\r\n\r\n\r\ndef get_index (key):\r\n    \tif (dict.has_key(key)):\r\n         return dict[key]\r\n    \telse:\r\n         dict_count=dict.values\r\n         new_item = {key : dict_count }\r\n         dict.update(new_item)\r\n         return new_item\r\n    \r\n\r\n# from https:\/\/pythonspot.com\/reading-csv-files-in-python\/\r\ndef readMyFile(filename):\r\n    text = []\r\n    categories = []\r\n    dict = {}\r\n \r\n    with open(filename, encoding='latin-1') as csvDataFile:\r\n        csvReader = csv.reader(csvDataFile)\r\n        for row in csvReader:\r\n          \t if (row[4] in dict):\r\n          \t    ind= dict[row[4]]\r\n          \t else:\r\n          \t    dict_count=len(dict)\r\n          \t    new_item = {row[4] : dict_count }\r\n          \t    dict.update(new_item)\r\n          \t    ind=dict_count\r\n\r\n          \t categories.append(ind)\r\n        \r\n          \t text.append(row[1] + &quot; &quot; + row[0] + &quot; &quot;+ row[6])\r\n \r\n    return categories, text\r\n \r\nfn=&quot;C:\\\\Users\\\\New-years-resolutions-DFE.csv&quot;\r\nlabels,new_texts = readMyFile(fn)\r\n \r\nprint(new_texts[0])\r\nprint(new_texts[1])\r\nprint(labels[0])\r\nprint(labels[1])\r\nprint(labels)\r\n\r\ntexts=[]\r\nfor txt in new_texts:\r\n    txt=preprocess2(txt)\r\n    texts.append(txt)\r\n \r\n\r\nimport numpy as np\r\nimport os\r\n\r\n\r\n\r\nos.environ['KERAS_BACKEND']='tensorflow'\r\n\r\nfrom keras.preprocessing.text import Tokenizer\r\n\r\nfrom keras.preprocessing.sequence import pad_sequences\r\n\r\nfrom keras.utils.np_utils import to_categorical\r\n\r\nfrom keras.layers import Dense, Input, Flatten\r\n\r\nfrom keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout\r\n\r\nfrom keras.models import Model\r\n\r\n\r\nMAX_SEQUENCE_LENGTH = 1000\r\n\r\nMAX_NB_WORDS = 20000\r\n\r\nEMBEDDING_DIM = 100\r\n\r\nVALIDATION_SPLIT = 0.2\r\n\r\ntokenizer = Tokenizer(nb_words=MAX_NB_WORDS)\r\n\r\ntokenizer.fit_on_texts(texts)\r\n\r\nsequences = tokenizer.texts_to_sequences(texts)\r\n\r\nword_index = tokenizer.word_index\r\n\r\nprint('Found %s unique tokens.' % len(word_index))\r\n\r\ndata = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)\r\n\r\nlabels = to_categorical(np.asarray(labels))\r\n\r\nprint('Shape of data tensor:', data.shape)\r\n\r\nprint('Shape of label tensor:', labels.shape)\r\n\r\nindices = np.arange(data.shape[0])\r\n\r\nnp.random.shuffle(indices)\r\n\r\ndata = data[indices]\r\n\r\nlabels = labels[indices]\r\n\r\nnb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])\r\n\r\nx_train = data[:-nb_validation_samples]\r\n\r\ny_train = labels[:-nb_validation_samples]\r\n\r\nx_val = data[-nb_validation_samples:]\r\n\r\ny_val = labels[-nb_validation_samples:]\r\n\r\n\r\n\r\nprint('Number of positive and negative reviews in traing and validation set ')\r\n\r\nprint (y_train.sum(axis=0))\r\n\r\nprint (y_val.sum(axis=0))\r\n\r\nGLOVE_DIR = &quot;C:\\\\Users\\\\pythonrunfiles&quot;\r\n\r\nembeddings_index = {}\r\n\r\nf = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding=&quot;utf-8&quot;)\r\n\r\nfor line in f:\r\n   \r\n    values = line.split()\r\n\r\n    word = values[0]\r\n\r\n    coefs = np.asarray(values[1:], dtype='float32')\r\n\r\n    embeddings_index[word] = coefs\r\n\r\nf.close()\r\n\r\n\r\n\r\nprint('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))\r\n\r\n\r\n\r\nembedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))\r\n\r\nfor word, i in word_index.items():\r\n\r\n    embedding_vector = embeddings_index.get(word)\r\n\r\n    if embedding_vector is not None:\r\n\r\n        # words not found in embedding index will be all-zeros.\r\n\r\n        embedding_matrix[i] = embedding_vector\r\n\r\n        \r\n\r\nembedding_layer = Embedding(len(word_index) + 1,\r\n\r\n                            EMBEDDING_DIM,\r\n\r\n                            weights=[embedding_matrix],\r\n\r\n                            input_length=MAX_SEQUENCE_LENGTH,\r\n\r\n                            trainable=True)\r\n\r\n\r\n\r\nsequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\r\n\r\nembedded_sequences = embedding_layer(sequence_input)\r\n\r\nl_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)\r\n\r\nl_pool1 = MaxPooling1D(5)(l_cov1)\r\n\r\nl_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)\r\n\r\nl_pool2 = MaxPooling1D(5)(l_cov2)\r\n\r\nl_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)\r\n\r\nl_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling\r\n\r\nl_flat = Flatten()(l_pool3)\r\n\r\nl_dense = Dense(128, activation='relu')(l_flat)\r\n\r\npreds = Dense(11, activation='softmax')(l_dense)    # was 2 instead 11\r\n\r\n\r\n\r\nmodel = Model(sequence_input, preds)\r\n\r\nmodel.compile(loss='categorical_crossentropy',\r\n\r\n              optimizer='rmsprop',\r\n\r\n              metrics=['acc'])\r\n\r\n\r\n\r\nprint(&quot;model fitting - simplified convolutional neural network&quot;)\r\n\r\nmodel.summary()\r\n\r\nprint (x_train)\r\nprint (y_train)\r\n\r\nmodel.fit(x_train, y_train, validation_data=(x_val, y_val),\r\n\r\n          nb_epoch=1, batch_size=128)       #epoch was 10\r\n\r\n\r\n\r\nembedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))\r\n\r\nfor word, i in word_index.items():\r\n\r\n    embedding_vector = embeddings_index.get(word)\r\n\r\n    if embedding_vector is not None:\r\n\r\n        # words not found in embedding index will be all-zeros.\r\n\r\n        embedding_matrix[i] = embedding_vector\r\n\r\n        \r\n\r\nembedding_layer = Embedding(len(word_index) + 1,\r\n\r\n                            EMBEDDING_DIM,\r\n\r\n                            weights=[embedding_matrix],\r\n\r\n                            input_length=MAX_SEQUENCE_LENGTH,\r\n\r\n                            trainable=True)\r\n\r\n\r\n\r\n# applying a more complex convolutional approach\r\n\r\nconvs = []\r\n\r\nfilter_sizes = [3,4,5]\r\n\r\n\r\n\r\nsequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\r\n\r\nembedded_sequences = embedding_layer(sequence_input)\r\n\r\n\r\n\r\nfor fsz in filter_sizes:\r\n\r\n    l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)\r\n\r\n    l_pool = MaxPooling1D(5)(l_conv)\r\n\r\n    convs.append(l_pool)\r\n\r\n    \r\n\r\nl_merge = Merge(mode='concat', concat_axis=1)(convs)\r\n\r\nl_cov1= Conv1D(128, 5, activation='relu')(l_merge)\r\n\r\nl_pool1 = MaxPooling1D(5)(l_cov1)\r\n\r\nl_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)\r\n\r\nl_pool2 = MaxPooling1D(30)(l_cov2)\r\n\r\nl_flat = Flatten()(l_pool2)\r\n\r\nl_dense = Dense(128, activation='relu')(l_flat)\r\n\r\npreds = Dense(11, activation='softmax')(l_dense)     \r\n\r\n\r\n\r\nmodel = Model(sequence_input, preds)\r\n\r\nmodel.compile(loss='categorical_crossentropy',\r\n\r\n              optimizer='rmsprop',\r\n\r\n              metrics=['acc'])\r\n\r\n\r\n\r\nprint(&quot;model fitting - more complex convolutional neural network&quot;)\r\n\r\nmodel.summary()\r\n\r\nmodel.fit(x_train, y_train, validation_data=(x_val, y_val),\r\n\r\n          nb_epoch=2, batch_size=50)  \r\n\r\n<\/pre>\n<div class=\"pofqm69efa4b195a57\" ><center>\n<script async src=\"\/\/pagead2.googlesyndication.com\/pagead\/js\/adsbygoogle.js\"><\/script>\n<!-- Text analytics techniques link ads horizontal Medium after content -->\n<ins class=\"adsbygoogle\"\n     style=\"display:inline-block;width:468px;height:15px\"\n     data-ad-client=\"ca-pub-3416618249440971\"\n     data-ad-slot=\"5765984772\"><\/ins>\n<script>\n(adsbygoogle = window.adsbygoogle || []).push({});\n<\/script>\n\n<script async src=\"\/\/pagead2.googlesyndication.com\/pagead\/js\/adsbygoogle.js\"><\/script>\n<ins class=\"adsbygoogle\"\n     style=\"display:block\"\n     data-ad-format=\"autorelaxed\"\n     data-ad-client=\"ca-pub-3416618249440971\"\n     data-ad-slot=\"3903486841\"><\/ins>\n<script>\n     (adsbygoogle = window.adsbygoogle || []).push({});\n<\/script>\n<\/center><\/div><style type=\"text\/css\">\r\n.pofqm69efa4b195a57 {\r\nmargin: 5px; padding: 0px;\r\n}\r\n@media screen and (min-width: 1201px) {\r\n.pofqm69efa4b195a57 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (min-width: 993px) and (max-width: 1200px) {\r\n.pofqm69efa4b195a57 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (min-width: 769px) and (max-width: 992px) {\r\n.pofqm69efa4b195a57 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (min-width: 768px) and (max-width: 768px) {\r\n.pofqm69efa4b195a57 {\r\ndisplay: block;\r\n}\r\n}\r\n@media screen and (max-width: 767px) {\r\n.pofqm69efa4b195a57 {\r\ndisplay: block;\r\n}\r\n}\r\n<\/style>\r\n","protected":false},"excerpt":{"rendered":"<p>This is source code for Text Classification for Different Datasets CNN based on the code from Text Classification, Part I &#8211; Convolutional Networks<\/p>\n","protected":false},"author":1,"featured_media":0,"parent":0,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"","meta":{"_monsterinsights_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0},"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.4 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>Text Classification for Tweet Dataset using Convolutional Neural Network - Text Analytics Techniques<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Text Classification for Tweet Dataset using Convolutional Neural Network - Text Analytics Techniques\" \/>\n<meta property=\"og:description\" content=\"This is source code for Text Classification for Different Datasets CNN based on the code from Text Classification, Part I &#8211; Convolutional Networks\" \/>\n<meta property=\"og:url\" content=\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/\" \/>\n<meta property=\"og:site_name\" content=\"Text Analytics Techniques\" \/>\n<meta property=\"article:modified_time\" content=\"2018-05-12T23:03:43+00:00\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Est. reading time\" \/>\n\t<meta name=\"twitter:data1\" content=\"5 minutes\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/\",\"url\":\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/\",\"name\":\"Text Classification for Tweet Dataset using Convolutional Neural Network - Text Analytics Techniques\",\"isPartOf\":{\"@id\":\"https:\/\/ai.intelligentonlinetools.com\/ml\/#website\"},\"datePublished\":\"2018-05-11T11:47:04+00:00\",\"dateModified\":\"2018-05-12T23:03:43+00:00\",\"breadcrumb\":{\"@id\":\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/ai.intelligentonlinetools.com\/ml\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Text Classification for Tweet Dataset using Convolutional Neural Network\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/ai.intelligentonlinetools.com\/ml\/#website\",\"url\":\"https:\/\/ai.intelligentonlinetools.com\/ml\/\",\"name\":\"Text Analytics Techniques\",\"description\":\"Text Analytics Techniques\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/ai.intelligentonlinetools.com\/ml\/?s={search_term_string}\"},\"query-input\":\"required name=search_term_string\"}],\"inLanguage\":\"en-US\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"Text Classification for Tweet Dataset using Convolutional Neural Network - Text Analytics Techniques","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/","og_locale":"en_US","og_type":"article","og_title":"Text Classification for Tweet Dataset using Convolutional Neural Network - Text Analytics Techniques","og_description":"This is source code for Text Classification for Different Datasets CNN based on the code from Text Classification, Part I &#8211; Convolutional Networks","og_url":"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/","og_site_name":"Text Analytics Techniques","article_modified_time":"2018-05-12T23:03:43+00:00","twitter_card":"summary_large_image","twitter_misc":{"Est. reading time":"5 minutes"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/","url":"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/","name":"Text Classification for Tweet Dataset using Convolutional Neural Network - Text Analytics Techniques","isPartOf":{"@id":"https:\/\/ai.intelligentonlinetools.com\/ml\/#website"},"datePublished":"2018-05-11T11:47:04+00:00","dateModified":"2018-05-12T23:03:43+00:00","breadcrumb":{"@id":"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/"]}]},{"@type":"BreadcrumbList","@id":"http:\/\/ai.intelligentonlinetools.com\/ml\/text-classification-tweet-dataset-using-convolutional-neural-network\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/ai.intelligentonlinetools.com\/ml\/"},{"@type":"ListItem","position":2,"name":"Text Classification for Tweet Dataset using Convolutional Neural Network"}]},{"@type":"WebSite","@id":"https:\/\/ai.intelligentonlinetools.com\/ml\/#website","url":"https:\/\/ai.intelligentonlinetools.com\/ml\/","name":"Text Analytics Techniques","description":"Text Analytics Techniques","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/ai.intelligentonlinetools.com\/ml\/?s={search_term_string}"},"query-input":"required name=search_term_string"}],"inLanguage":"en-US"}]}},"_links":{"self":[{"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/pages\/316"}],"collection":[{"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/pages"}],"about":[{"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/comments?post=316"}],"version-history":[{"count":3,"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/pages\/316\/revisions"}],"predecessor-version":[{"id":334,"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/pages\/316\/revisions\/334"}],"wp:attachment":[{"href":"http:\/\/ai.intelligentonlinetools.com\/ml\/wp-json\/wp\/v2\/media?parent=316"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}