Unstructured-Classification Hands-On Solutions
The course ID for Unstructured-Classification is 55943.

Install --> Test --> Run --> Open Preview

Copy the URL and paste it into the next tab.

Click on unstructured_test.ipynb.

Step 1:-

import pandas as pd

import numpy as np

import csv

Step 2:-

#Data Loading

imdb=pd.read_csv("imdb.csv")

imdb.columns = ["index","text","label"]

print(imdb.head(5))

Step 3:-

data_size = imdb.shape

print(data_size)

imdb_col_names = list(imdb.columns)

print(imdb_col_names)

print(imdb.groupby('label').describe())

print(imdb.head(3))

Step 4:-

imdb_target=imdb['label'] 

print(imdb_target)

Step 5:-

from nltk.tokenize import word_tokenize

import nltk

nltk.download('all')

def split_tokens(text):

  text = text.lower()

  word_tokens = word_tokenize(text)

  return word_tokens

imdb['tokenized_message'] = imdb.apply(lambda row: split_tokens(row['text']), axis = 1)
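A quick sanity check of the tokenizer (optional; the sample sentence below is made up for illustration and is not part of the dataset):

# Optional check -- the sample sentence is illustrative only
print(split_tokens("This movie was GREAT!"))
# expected output: ['this', 'movie', 'was', 'great', '!']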

Step 6:- 

from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(text):

    lemma = []

    lemmatizer = WordNetLemmatizer()

    for word in text:

        a=lemmatizer.lemmatize(word)

        lemma.append(a)

    return lemma

imdb['lemmatized_message'] = imdb.apply(lambda row: split_into_lemmas(row['tokenized_message']),axis=1)

print('Tokenized message:', imdb['tokenized_message'][55])

print('Lemmatized message:', imdb['lemmatized_message'][55])
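To see the lemmatizer in isolation, you can try it on a couple of standalone words (these examples are just illustrations):

# Optional check of the lemmatizer on its own -- sample words are illustrative only
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('movies'))    # -> 'movie'
print(lemmatizer.lemmatize('children'))  # -> 'child'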

Step 7:- 

from nltk.corpus import stopwords

def stopword_removal(text):

    stop_words = set(stopwords.words('english'))

    filtered_sentence = ' '.join([word for word in text if word not in stop_words])

    return filtered_sentence

imdb['preprocessed_message'] = imdb.apply(lambda row: stopword_removal(row['lemmatized_message']),axis = 1)

print('Preprocessed message:',imdb['preprocessed_message'])

Training_data=pd.Series(list(imdb['preprocessed_message']))

Training_label=pd.Series(list(imdb['label']))
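An optional check of stopword_removal on a hand-made token list (the tokens are invented for the example and have nothing to do with the grading):

# Optional check -- the token list is invented for illustration
print(stopword_removal(['this', 'movie', 'was', 'great']))
# expected output: 'movie great'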

Step 8:- 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tf_vectorizer = CountVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)   

Total_Dictionary_TDM = tf_vectorizer.fit(Training_data)

message_data_TDM = Total_Dictionary_TDM.transform(Training_data)
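If you want to confirm that the term-document matrix was built as expected, you can inspect its shape and vocabulary size (optional; uses only the variables defined above):

# Optional inspection of the term-document matrix
print("TDM shape:", message_data_TDM.shape)                # (number of documents, number of 1-2 gram features)
print("Vocabulary size:", len(tf_vectorizer.vocabulary_))  # distinct terms kept by the vectorizer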

Step 9:- 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)

Total_Dictionary_TFIDF = tfidf_vectorizer.fit(Training_data)

message_data_TFIDF = Total_Dictionary_TFIDF.transform(Training_data)
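A similar optional check for the TF-IDF matrix; it is stored as a sparse matrix, so .nnz reports how many non-zero weights a row holds:

# Optional inspection of the TF-IDF matrix (sparse)
print("TF-IDF shape:", message_data_TFIDF.shape)
print("Non-zero entries in the first document:", message_data_TFIDF[0].nnz)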


Step 10:- 

from sklearn.model_selection import train_test_split  # splitting the data for training and testing

train_data,test_data, train_label, test_label = train_test_split(message_data_TDM,Training_label,test_size = 0.1)
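Note that train_test_split shuffles the data randomly, so the split (and the scores below) will vary between runs. If you want a reproducible split you can pass random_state; the value 9 below simply reuses the seed from the next step and is an assumption, not a course requirement:

# Optional reproducible split -- random_state=9 is an assumption, not required by the exercise
# train_data, test_data, train_label, test_label = train_test_split(
#     message_data_TDM, Training_label, test_size=0.1, random_state=9)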

Step 11:- 

seed=9

from sklearn.svm import SVC

train_data_shape = train_data.shape

test_data_shape = test_data.shape

print("The shape of train data", train_data_shape)

print("The shape of test data", test_data_shape )

classifier = SVC(kernel="linear",C=0.025,random_state=seed)

classifier = classifier.fit(train_data,train_label)

#target = 

score = classifier.score(test_data, test_label)

print('SVM Classifier : ', score)

with open('output.txt', 'w') as file:

    file.write(str((imdb['tokenized_message'][55],imdb['lemmatized_message'][55])))
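If you also want a per-class breakdown of the SVM predictions (not required for the hands-on and not part of the graded output), here is a small sketch using scikit-learn's confusion_matrix:

# Optional: per-class breakdown of the SVM predictions
from sklearn.metrics import confusion_matrix
predicted = classifier.predict(test_data)
print(confusion_matrix(test_label, predicted))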


Step 12:-


from sklearn.linear_model import SGDClassifier

train_data,test_data, train_label, test_label = train_test_split( message_data_TDM, Training_label, test_size = 0.2)

train_data_shape = train_data.shape

test_data_shape = test_data.shape 

print("The shape of train data", train_data_shape  )

print("The shape of test data", test_data_shape )

classifier = SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed)

classifier = classifier.fit(train_data,train_label)

#target=

score = classifier.score(test_data,test_label)

print('SGD classifier : ',score)

with open('output1.txt', 'w') as file:

    file.write(str((imdb['preprocessed_message'][55])))
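Similarly, an optional precision/recall/F1 report for the SGD classifier (not part of the graded output):

# Optional: per-class precision, recall and F1 for the SGD model
from sklearn.metrics import classification_report
predicted = classifier.predict(test_data)
print(classification_report(test_label, predicted))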
