ML
1. Write a Python program to perform data preprocessing on a given data set.
import pandas as pd
# Step 1: Load the dataset from the Excel file into a DataFrame
file_path = 'C:\\Users\\datset_noise.xlsx'
df = pd.read_excel(file_path, engine='openpyxl')
print("Original DataFrame:")
print(df.to_string(index=False))
# Step 2: Remove rows with missing values
df_no_missing = df.dropna()
print("\nDataFrame after removing rows with missing values:")
print(df_no_missing.to_string(index=False))
print(f"\nTotal count after removing rows with missing values: {len(df_no_missing)}")
# Store rows with missing values
df_missing_rows = df[df.isna().any(axis=1)]
# Step 3: Remove duplicate rows
df_no_duplicates = df_no_missing.drop_duplicates(subset=["Name","DOB","Age","Date of Joining"])
print("\nDataFrame after removing duplicate rows:")
print(df_no_duplicates.to_string(index=False))
print(f"\nTotal count after removing duplicate rows: {len(df_no_duplicates)}")
# Store duplicate rows
df_duplicate_rows = df_no_missing[df_no_missing.duplicated(subset=["Name","DOB","Age","Date of Joining"])]
# Step 4: Remove outliers (assume Age > 80 is an outlier, for example purposes)
df_no_outliers = df_no_duplicates[df_no_duplicates["Age"] <= 80]
print("\nDataFrame after removing outliers:")
print(df_no_outliers.to_string(index=False))
print(f"\nTotal count after removing outliers: {len(df_no_outliers)}")
# Store outlier rows
df_outlier_rows = df_no_duplicates[df_no_duplicates["Age"] > 80]
# Display the final cleaned DataFrame
print("\nFinal Cleaned DataFrame:")
print(df_no_outliers.to_string(index=False))
total_count = len(df_no_outliers)
print(f"\nTotal count of records in the cleaned data: {total_count}")
# Save the cleaned DataFrame to a CSV file
df_no_outliers.to_csv("Cleaned_Employee_Data.csv", index=False)
# Combine all deleted records
df_deleted_records = pd.concat([df_missing_rows, df_duplicate_rows, df_outlier_rows], ignore_index=True)
# Display deleted records
print("\nDeleted Records:")
print(df_deleted_records.to_string(index=False))
total_deleted = len(df) - len(df_no_outliers)
print(f"\nTotal count of deleted records: {total_deleted}")
2. Write a Python program to illustrate the Naive Bayes algorithm.
from sklearn.naive_bayes import GaussianNB
import numpy as np
X = np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])
model = GaussianNB()
model.fit(X, Y)
prediction = model.predict([[-2, 0]])
print(prediction)
**********************************************************
import numpy as np
from sklearn.naive_bayes import GaussianNB
# Sample dataset (Study hours, Past grades)
X = np.array([
[1, 3], [2, 4], [3, 5], [4, 6], [5, 7], # Fail group
[6, 8], [7, 9], [8, 9], [9, 10] # Pass group
])
# Labels (0 = Fail, 1 = Pass)
Y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1])
# Create and train the Naive Bayes classifier
model = GaussianNB()
model.fit(X, Y)
# Predict for a new student with 4 hours of study and a past grade of 7
new_student = [[4, 7]]
prediction = model.predict(new_student)
print(prediction)  # 0 = Fail, 1 = Pass
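GaussianNB can also report per-class probabilities through predict_proba, which shows how confident the prediction is; a short sketch reusing model and new_student from above:
# Class probabilities for the same student (columns follow model.classes_, i.e. [0, 1])
probabilities = model.predict_proba(new_student)
print("P(Fail), P(Pass):", probabilities[0])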
3. Write a Python program to illustrate the Linear regression algorithm.
from sklearn.linear_model import LinearRegression
X = [[2001,5.2],[2002,5.1],[2003,5.1],[2004,4.9],[2005,5.0],[2006,5.1],[2007,5.4],[2008,5.6],[2009,5.9]]
Y = [2.5, 2.52, 2.54, 2.48, 3.28, 3.2, 3.15, 3.26, 3.29]
print(len(X), len(Y))  # Sanity check: both lists have 9 entries
LinR_model = LinearRegression()
LinR_model.fit(X, Y)
prediction = LinR_model.predict([[2010,5.1]])
print(prediction)
prediction_2011 = LinR_model.predict([[2011,5.2]])
print(prediction_2011)
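Since the fitted model is just a line, LinearRegression exposes its parameters directly; a short sketch reusing LinR_model from above:
# Inspect the fitted model: one coefficient per feature, plus the intercept
print("Coefficients:", LinR_model.coef_)
print("Intercept:", LinR_model.intercept_)
print("R^2 on the training data:", LinR_model.score(X, Y))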
4. Write a Python program to illustrate the Logistic regression algorithm.
from sklearn import linear_model
X = [[165,19],[175,32],[136,35],[174,65],[141,28],[176,15],[131,32],
[166,6],[128,32],[179,10],[136,34],[186,2],[126,25],[176,28],[112,38],
[169,9],[171,36],[116,25],[196,25]]
Y = ['Man','Woman','Woman','Man','Woman','Man','Woman','Man','Woman',
'Man','Woman','Man','Woman','Woman','Woman','Man','Woman','Woman','Man']
data_feature_names = ['height','age']
LR_model = linear_model.LogisticRegression()
LR_model.fit(X, Y)
prediction = LR_model.predict([[169,19]])
print(prediction)
print('Accuracy on the training subset is:', LR_model.score(X, Y))
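Logistic regression is a probabilistic classifier, so predict_proba shows the confidence behind each label; a short sketch reusing LR_model (classes_ gives the column order):
# Class probabilities for the same query point
print(LR_model.classes_)
print(LR_model.predict_proba([[169, 19]]))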
5. Write a Python program to illustrate the SVM algorithm.
from sklearn.svm import SVC
data_feature_names = ['height','age'] # The two feature columns of the dataset
X = [[165,19],[175,32],[136,35],[174,65],[141,28],[176,15],[131,32],
[166,6],[128,32],[179,10],[136,34],[186,2],[126,25],[176,28],[112,38],
[169,9],[171,36],[116,25],[196,25]]
Y = ['Man','Woman','Woman','Man','Woman','Man','Woman','Man','Woman',
'Man','Woman','Man','Woman','Woman','Woman','Man','Woman','Woman','Man']
SVC_model = SVC(gamma='auto')
SVC_model.fit(X, Y)
print(SVC_model.predict([[156, 53]]))
print('Accuracy on the training subset:', SVC_model.score(X, Y))
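The kernel is the main knob on an SVM; a minimal sketch comparing a linear kernel with the default RBF kernel on the same data (training accuracy only, so the numbers say nothing about generalization):
# Compare two kernels on the same training data
for kernel in ['linear', 'rbf']:
    clf = SVC(kernel=kernel, gamma='auto')
    clf.fit(X, Y)
    print(kernel, 'training accuracy:', clf.score(X, Y))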
6. Write a Python program to illustrate the KNN algorithm.
import numpy as np # Numerical computations.
import matplotlib.pyplot as plt # Data Visualization
from sklearn.neighbors import NearestNeighbors
A = np.array(
[[3.1, 2.3],[2.3, 4.2],[3.9, 3.5],[3.7, 6.4],[4.8, 1.9],[8.3, 3.1],[5.2, 7.5],[4.8, 4.7],[3.5, 5.1],[4.4, 2.9]]
)
plt.figure()
plt.title('Input data')
plt.scatter(A[:,0], A[:,1], marker = 'x', s = 50, color = 'red')
test_data = [5.2, 2.9]
knn_model = NearestNeighbors(n_neighbors = 3, algorithm = 'auto')
knn_model.fit(A)
distances, indices = knn_model.kneighbors([test_data])
print(distances)
print(indices)
print("\nK Nearest Neighbors:")
# indices is a 2-D array with a single row containing the index positions of the k nearest neighbors
for rank, index in enumerate(indices[0], start=1):
    print(str(rank) + " is", A[index])
plt.figure()
plt.title('Nearest neighbors')
plt.scatter(A[:, 0], A[:, 1], marker = 'x', s = 100, color = 'red')
plt.scatter(test_data[0], test_data[1],marker = 'x', s = 100, color = 'blue')
plt.show()
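NearestNeighbors only retrieves neighbors; to turn this into KNN classification, scikit-learn offers KNeighborsClassifier. A minimal sketch on the same points, with made-up binary labels purely for illustration:
from sklearn.neighbors import KNeighborsClassifier
# Hypothetical labels for the ten points in A (illustration only)
labels = np.array([0, 0, 0, 1, 0, 1, 1, 1, 1, 0])
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(A, labels)
print("Predicted class for test_data:", knn_clf.predict([test_data]))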
7. Write a Python program to illustrate the K-means algorithm.
from sklearn.cluster import KMeans
data_features = ["Height", "Age"]
X = [[165,19],[175,32],[136,35],[174,65],[141,28],[176,15],[131,32],
[166,6],[128,32],[179,10],[136,34],[186,20],[126,25],[176,28],[112,38],
[169,9],[171,36],[116,25],[196,25]]
model = KMeans(n_clusters=3, n_init=10, random_state=0)  # fixed seed for reproducible clusters
# Fitting Model
model.fit(X)
cluster_labels = model.predict(X)
# Printing Predictions
print(cluster_labels)
x1 = []  # height
x2 = []  # age
for item in X:
    x1.append(item[0])
    x2.append(item[1])
print(x1)
print(x2)
import matplotlib.pyplot as plt
plt.scatter(x1,x2, c=model.labels_)
plt.show()
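Picking n_clusters=3 is a guess; the elbow method is a common heuristic, plotting the within-cluster inertia for a range of k and looking for the bend. A minimal sketch on the same X:
# Elbow method: inertia vs. number of clusters
inertias = []
for k in range(1, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    inertias.append(km.inertia_)
plt.plot(range(1, 7), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.show()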
8. Write a Python program to illustrate the Apriori algorithm.
# Importing the necessary library
from efficient_apriori import apriori
transactions = [
    ['butter', 'milk', 'bread'],
    ['butter', 'milk', 'apple'],
    ['bread', 'milk', 'banana'],
    ['milk', 'bread', 'butter']
]
itemsets, rules = apriori(transactions, min_support=0.3, min_confidence=0.8)
print("Frequent itemsets:")
for k, v in itemsets.items():
    print(f"Level {k}: {v}")
print("\nAssociation Rules:")
for rule in rules:
    print(rule)
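Each rule object from efficient_apriori carries confidence and lift attributes, so the output can be filtered and ranked; a short sketch keeping only single-item rules and sorting by lift (reuses rules from above):
# Keep only 1 -> 1 rules and rank them by lift
simple_rules = [r for r in rules if len(r.lhs) == 1 and len(r.rhs) == 1]
for rule in sorted(simple_rules, key=lambda r: r.lift, reverse=True):
    print(rule)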