ML
1. Write a Python program to perform data preprocessing on a given data set.
import pandas as pd
# Step 1: Load the dataset from the Excel file into a DataFrame
file_path = 'C:\\Users\\datset_noise.xlsx'
df = pd.read_excel(file_path, engine='openpyxl')
print("Original DataFrame:")
print(df.to_string(index=False))
# Step 2: Remove rows with missing values
df_no_missing = df.dropna()
print("\nDataFrame after removing rows with missing values:")
print(df_no_missing.to_string(index=False))
print(f"\nTotal count after removing rows with missing values: {len(df_no_missing)}")
# Store rows with missing values
df_missing_rows = df[df.isna().any(axis=1)]
# Step 3: Remove duplicate rows
df_no_duplicates = df_no_missing.drop_duplicates(subset=["Name","DOB","Age","Date of Joining"])
print("\nDataFrame after removing duplicate rows:")
print(df_no_duplicates.to_string(index=False))
print(f"\nTotal count after removing duplicate rows: {len(df_no_duplicates)}")
# Store duplicate rows
df_duplicate_rows = df_no_missing[df_no_missing.duplicated(subset=["Name","DOB","Age","Date of Joining"])]
# Step 4: Remove outliers (assume Age > 80 is an outlier, for example purposes)
df_no_outliers = df_no_duplicates[df_no_duplicates["Age"] <= 80]
print("\nDataFrame after removing outliers:")
print(df_no_outliers.to_string(index=False))
print(f"\nTotal count after removing outliers: {len(df_no_outliers)}")
# Store outlier rows
df_outlier_rows = df_no_duplicates[df_no_duplicates["Age"] > 80]
# Display the final cleaned DataFrame
print("\nFinal Cleaned DataFrame:")
print(df_no_outliers.to_string(index=False))
total_count = len(df_no_outliers)
print(f"\nTotal count of records in the cleaned data: {total_count}")
# Save the cleaned DataFrame to a CSV file
df_no_outliers.to_csv("Cleaned_Employee_Data.csv", index=False)
# Combine all deleted records
df_deleted_records = pd.concat([df_missing_rows, df_duplicate_rows, df_outlier_rows], ignore_index=True)
# Display deleted records
print("\nDeleted Records:")
print(df_deleted_records.to_string(index=False))
total_deleted = len(df) - len(df_no_outliers)
print(f"\nTotal count of deleted records: {total_deleted}")
2. Write a Python program to illustrate the Naive Bayes algorithm.
from sklearn.naive_bayes import GaussianNB
import numpy as np
X = np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])
model = GaussianNB()
model.fit(X, Y)
prediction = model.predict([[-2, 0]])
print(prediction)
**********************************************************
import numpy as np
from sklearn.naive_bayes import GaussianNB
# Sample dataset (Study hours, Past grades)
X = np.array([
[1, 3], [2, 4], [3, 5], [4, 6], [5, 7], # Fail group
[6, 8], [7, 9], [8, 9], [9, 10] # Pass group
])
# Labels (0 = Fail, 1 = Pass)
Y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1])
# Create and train the Naive Bayes classifier
model = GaussianNB()
model.fit(X, Y)
# Predict for a new student with 4 hours of study and a past grade of 7
new_student = [[4, 7]]
prediction = model.predict(new_student)
print(prediction)  # 0 = Fail, 1 = Pass
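GaussianNB can also report per-class probabilities through predict_proba, which shows how confident the prediction is; a short sketch reusing model and new_student from above:
# Class probabilities for the same student (columns follow model.classes_, i.e. [0, 1])
probabilities = model.predict_proba(new_student)
print("P(Fail), P(Pass):", probabilities[0])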
3. Write a Python program to illustrate the Linear regression algorithm.
from sklearn.linear_model import LinearRegression
X = [[2001,5.2],[2002,5.1],[2003,5.1],[2004,4.9],[2005,5.0],[2006,5.1],[2007,5.4],[2008,5.6],[2009,5.9]]
Y = [2.5, 2.52, 2.54, 2.48, 3.28, 3.2, 3.15, 3.26, 3.29]
print(len(X), len(Y))  # Sanity check: both lists have 9 entries
LinR_model = LinearRegression()
LinR_model.fit(X, Y)
prediction = LinR_model.predict([[2010,5.1]])
print(prediction)
prediction_2011 = LinR_model.predict([[2011,5.2]])
print(prediction_2011)
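Since the fitted model is just a line, LinearRegression exposes its parameters directly; a short sketch reusing LinR_model from above:
# Inspect the fitted model: one coefficient per feature, plus the intercept
print("Coefficients:", LinR_model.coef_)
print("Intercept:", LinR_model.intercept_)
print("R^2 on the training data:", LinR_model.score(X, Y))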
4. Write a Python program to illustrate the Logistic regression algorithm.
from sklearn import linear_model
X = [[165,19],[175,32],[136,35],[174,65],[141,28],[176,15],[131,32],
[166,6],[128,32],[179,10],[136,34],[186,2],[126,25],[176,28],[112,38],
[169,9],[171,36],[116,25],[196,25]]
Y = ['Man','Woman','Woman','Man','Woman','Man','Woman','Man','Woman',
'Man','Woman','Man','Woman','Woman','Woman','Man','Woman','Woman','Man']
data_feature_names = ['height','age']
LR_model = linear_model.LogisticRegression()
LR_model.fit(X, Y)
prediction = LR_model.predict([[169,19]])
print(prediction)
print('Accuracy on the training subset is:', LR_model.score(X, Y))
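Logistic regression is a probabilistic classifier, so predict_proba shows the confidence behind each label; a short sketch reusing LR_model (classes_ gives the column order):
# Class probabilities for the same query point
print(LR_model.classes_)
print(LR_model.predict_proba([[169, 19]]))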
5. Write a Python program to illustrate the SVM algorithm.
from sklearn.svm import SVC
data_feature_names = ['height','age'] # The two feature columns of the dataset
X = [[165,19],[175,32],[136,35],[174,65],[141,28],[176,15],[131,32],
[166,6],[128,32],[179,10],[136,34],[186,2],[126,25],[176,28],[112,38],
[169,9],[171,36],[116,25],[196,25]]
Y = ['Man','Woman','Woman','Man','Woman','Man','Woman','Man','Woman',
'Man','Woman','Man','Woman','Woman','Woman','Man','Woman','Woman','Man']
SVC_model = SVC(gamma='auto')
SVC_model.fit(X, Y)
print(SVC_model.predict([[156, 53]]))
print('Accuracy on the training subset:', SVC_model.score(X, Y))
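The kernel is the main knob on an SVM; a minimal sketch comparing a linear kernel with the default RBF kernel on the same data (training accuracy only, so the numbers say nothing about generalization):
# Compare two kernels on the same training data
for kernel in ['linear', 'rbf']:
    clf = SVC(kernel=kernel, gamma='auto')
    clf.fit(X, Y)
    print(kernel, 'training accuracy:', clf.score(X, Y))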
6. Write a Python program to illustrate the KNN algorithm.
import numpy as np # Numerical computations.
import matplotlib.pyplot as plt # Data Visualization
from sklearn.neighbors import NearestNeighbors
A = np.array(
[[3.1, 2.3],[2.3, 4.2],[3.9, 3.5],[3.7, 6.4],[4.8, 1.9],[8.3, 3.1],[5.2, 7.5],[4.8, 4.7],[3.5, 5.1],[4.4, 2.9]]
)
plt.figure()
plt.title('Input data')
plt.scatter(A[:,0], A[:,1], marker = 'x', s = 50, color = 'red')
test_data = [5.2, 2.9]
knn_model = NearestNeighbors(n_neighbors = 3, algorithm = 'auto')
knn_model.fit(A)
distances, indices = knn_model.kneighbors([test_data])
print(distances)
print(indices)
print("\nK Nearest Neighbors:")
# indices is a 2-D array with a single row containing the index positions of the k nearest neighbors
for rank, index in enumerate(indices[0], start=1):
    print(str(rank) + " is", A[index])
plt.figure()
plt.title('Nearest neighbors')
plt.scatter(A[:, 0], A[:, 1], marker = 'x', s = 100, color = 'red')
plt.scatter(test_data[0], test_data[1],marker = 'x', s = 100, color = 'blue')
plt.show()
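NearestNeighbors only retrieves neighbors; to turn this into KNN classification, scikit-learn offers KNeighborsClassifier. A minimal sketch on the same points, with made-up binary labels purely for illustration:
from sklearn.neighbors import KNeighborsClassifier
# Hypothetical labels for the ten points in A (illustration only)
labels = np.array([0, 0, 0, 1, 0, 1, 1, 1, 1, 0])
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(A, labels)
print("Predicted class for test_data:", knn_clf.predict([test_data]))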
7. Write a Python program to illustrate the K-means algorithm.
from sklearn.cluster import KMeans
data_features = ["Height", "Age"]
X = [[165,19],[175,32],[136,35],[174,65],[141,28],[176,15],[131,32],
[166,6],[128,32],[179,10],[136,34],[186,20],[126,25],[176,28],[112,38],
[169,9],[171,36],[116,25],[196,25]]
model = KMeans(n_clusters=3, n_init=10, random_state=0)  # fixed seed for reproducible clusters
# Fitting Model
model.fit(X)
cluster_labels = model.predict(X)
# Printing Predictions
print(cluster_labels)
x1 = []  # height
x2 = []  # age
for item in X:
    x1.append(item[0])
    x2.append(item[1])
print(x1)
print(x2)
import matplotlib.pyplot as plt
plt.scatter(x1,x2, c=model.labels_)
plt.show()
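Picking n_clusters=3 is a guess; the elbow method is a common heuristic, plotting the within-cluster inertia for a range of k and looking for the bend. A minimal sketch on the same X:
# Elbow method: inertia vs. number of clusters
inertias = []
for k in range(1, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    inertias.append(km.inertia_)
plt.plot(range(1, 7), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.show()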
8. Write a Python program to illustrate the Apriori algorithm.
# Importing the necessary library
from efficient_apriori import apriori
transactions = [
    ['butter', 'milk', 'bread'],
    ['butter', 'milk', 'apple'],
    ['bread', 'milk', 'banana'],
    ['milk', 'bread', 'butter']
]
itemsets, rules = apriori(transactions, min_support=0.3, min_confidence=0.8)
print("Frequent itemsets:")
for k, v in itemsets.items():
    print(f"Level {k}: {v}")
print("\nAssociation Rules:")
for rule in rules:
    print(rule)
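Each rule object from efficient_apriori carries confidence and lift attributes, so the output can be filtered and ranked; a short sketch keeping only single-item rules and sorting by lift (reuses rules from above):
# Keep only 1 -> 1 rules and rank them by lift
simple_rules = [r for r in rules if len(r.lhs) == 1 and len(r.rhs) == 1]
for rule in sorted(simple_rules, key=lambda r: r.lift, reverse=True):
    print(rule)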