Unstructured-Classification Hands-On Solutions
The course id of Unstructured-Classification is 55943

Install --> Test --> Run --> Open Preview

Copy the URL and paste it in the next tab.

Click on unstructured_test.ipynb.

Step 1:- 

import pandas as pd

import numpy as np

import csv

Step 2:- 

#Data Loading

imdb=pd.read_csv("imdb.csv")

imdb.columns = ["index","text","label"]

print(imdb.head(5))
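
If you want to dry-run the later steps outside the course environment, a tiny stand-in DataFrame with the same three columns also works (the sample rows and labels below are made up, not from the real imdb.csv):

import pandas as pd

# Made-up stand-in rows for imdb.csv, only for local experimentation
imdb = pd.DataFrame({
    "index": [0, 1],
    "text": ["A wonderful little production with superb acting.",
             "The plot was predictable and the pacing painfully slow."],
    "label": [1, 0],   # assuming 1 = positive review, 0 = negative review
})
print(imdb.head())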

Step 3:- 

data_size = imdb.shape

print(data_size)

imdb_col_names = list(imdb.columns)

print(imdb_col_names)

print(imdb.groupby('label').describe())

print(imdb.head(3))

Step 4:- 

imdb_target=imdb['label'] 

print(imdb_target)

Step 5:- 

from nltk.tokenize import word_tokenize

import nltk

nltk.download('all')

def split_tokens(text):

  text = text.lower()

  word_tokens = word_tokenize(text)

  return word_tokens

imdb['tokenized_message'] = imdb.apply(lambda row: split_tokens(row['text']), axis = 1)
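
To see what split_tokens actually produces, try it on one sentence first (this assumes the punkt tokenizer data was pulled in by the nltk.download call above):

from nltk.tokenize import word_tokenize

sample = "This movie was NOT what I expected, but I loved it!"
print(word_tokenize(sample.lower()))
# ['this', 'movie', 'was', 'not', 'what', 'i', 'expected', ',', 'but', 'i', 'loved', 'it', '!']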

Step 6:- 

from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(text):

    lemma = []

    lemmatizer = WordNetLemmatizer()

    for word in text:

        a=lemmatizer.lemmatize(word)

        lemma.append(a)

    return lemma

imdb['lemmatized_message'] = imdb.apply(lambda row: split_into_lemmas(row['tokenized_message']),axis=1)

print('Tokenized message:', imdb['tokenized_message'][55])

print('Lemmatized message:', imdb['lemmatized_message'][55])
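
For intuition on what the lemmatizer does, here is a standalone check (WordNetLemmatizer treats every word as a noun unless you pass a part of speech, which is why some verb forms pass through unchanged):

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("movies"))         # movies -> movie
print(lemmatizer.lemmatize("running"))        # stays "running" under the default noun POS
print(lemmatizer.lemmatize("running", "v"))   # running -> run when treated as a verb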

Step 7:- 

from nltk.corpus import stopwords

def stopword_removal(text):

    stop_words = set(stopwords.words('english'))

    filtered_sentence = ' '.join([word for word in text if word not in stop_words])

    return filtered_sentence

imdb['preprocessed_message'] = imdb.apply(lambda row: stopword_removal(row['lemmatized_message']),axis = 1)

print('Preprocessed message:',imdb['preprocessed_message'])

Training_data=pd.Series(list(imdb['preprocessed_message']))

Training_label=pd.Series(list(imdb['label']))
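
As a quick sanity check, stopword_removal can be run on a hand-made token list; note that it returns one joined string per review, which is exactly what the vectorizers in the next steps expect, and that words like "not" are part of NLTK's English stopword list:

from nltk.corpus import stopwords

tokens = ["this", "movie", "was", "not", "what", "i", "expected"]
stop_words = set(stopwords.words('english'))
print(' '.join(w for w in tokens if w not in stop_words))   # prints: movie expected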

Step 8:- 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tf_vectorizer = CountVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)   

Total_Dictionary_TDM = tf_vectorizer.fit(Training_data)

message_data_TDM = Total_Dictionary_TDM.transform(Training_data)
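
On a toy corpus the term-document matrix (TDM) is easy to inspect; the sentences below are made up just to show what fit and transform return:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["good movie", "bad movie", "good acting bad plot"]
vec = CountVectorizer(ngram_range=(1, 2))
tdm = vec.fit_transform(docs)
print(vec.get_feature_names_out())   # unigram and bigram vocabulary (get_feature_names() on older scikit-learn)
print(tdm.toarray())                 # one row per document, one column per term, values are raw counts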

Step 9:- 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)

Total_Dictionary_TFIDF = tfidf_vectorizer.fit(Training_data)

message_data_TFIDF = Total_Dictionary_TFIDF.transform(Training_data)
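
TF-IDF follows the same fit/transform pattern but downweights terms that occur in most documents; a minimal standalone illustration on made-up sentences:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the movie was great", "the movie was terrible", "the plot was thin"]
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(docs)
# In the first row, "great" (unique to that document) gets a higher weight
# than "the" or "was", which appear in every document.
print(dict(zip(tfidf.get_feature_names_out(), matrix.toarray()[0].round(2))))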


Step 10:- 

#Splitting the data for training and testing

from sklearn.model_selection import train_test_split

train_data,test_data, train_label, test_label = train_test_split(message_data_TDM,Training_label,test_size = 0.1)
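
If you want the split to be reproducible across runs, you can also pass a random_state; this is an optional variant, assuming message_data_TDM and Training_label from the earlier steps are still in scope:

from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, Training_label, test_size=0.1, random_state=9)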

Step 11:- 

seed=9

from sklearn.svm import SVC

train_data_shape = train_data.shape

test_data_shape = test_data.shape

print("The shape of train data", train_data_shape)

print("The shape of test data", test_data_shape )

classifier = SVC(kernel="linear",C=0.025,random_state=seed)

classifier = classifier.fit(train_data,train_label)

score = classifier.score(test_data,test_label)

print('SVM Classifier : ',score)

with open('output.txt', 'w') as file:

    file.write(str((imdb['tokenized_message'][55],imdb['lemmatized_message'][55])))
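
Beyond a single accuracy number, classification_report from scikit-learn shows per-class precision and recall; this is an optional extra check on the fitted SVC, assuming the train/test split from Step 10:

from sklearn.metrics import classification_report

predictions = classifier.predict(test_data)
print(classification_report(test_label, predictions))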


Step 12:-


from sklearn.linear_model import SGDClassifier

train_data,test_data, train_label, test_label = train_test_split( message_data_TDM, Training_label, test_size = 0.2)

train_data_shape = train_data.shape

test_data_shape = test_data.shape 

print("The shape of train data", train_data_shape  )

print("The shape of test data", test_data_shape )

classifier =  SGDClassifier( loss='modified_huber',shuffle = True, random_state = seed )

classifier = classifier.fit(train_data,train_label)


score = classifier.score(test_data,test_label)

print('SGD classifier : ',score)

with open('output1.txt', 'w') as file:

    file.write(str((imdb['preprocessed_message'][55])))
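
To score a brand-new review with the trained SGD model, it has to be transformed with the same fitted vectorizer that built the TDM; the sketch below assumes tf_vectorizer and classifier from the steps above, and for a fully faithful result the new text should also go through the same tokenize/lemmatize/stopword preprocessing first (the review sentence is made up):

new_review = ["the acting was brilliant and the story kept me hooked"]
new_features = tf_vectorizer.transform(new_review)   # reuse the already-fitted CountVectorizer
print(classifier.predict(new_features))              # predicted label for the new review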
