✨ AI/ML
Machine Learning with Python
Last updated: 2025-09-25 12:47:03
Machine Learning Fundamentals
Get started with machine learning using Python, scikit-learn, and popular ML libraries.
Setting Up the Environment
# Install the core packages used throughout this tutorial
pip install numpy pandas matplotlib scikit-learn jupyter
# For deep learning
# (PyTorch is published on PyPI as "torch"; "pip install pytorch" does not install it)
pip install tensorflow keras torch torchvision
# Data visualization
pip install seaborn plotly

# --- Data Preprocessing ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Load dataset (the code below expects a 'target' column holding the label)
df = pd.read_csv('data.csv')

# Basic data exploration
print(df.head())
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(df.describe())
print(df.isnull().sum())

# Handle missing values: replace NaNs in every numeric column with that column's mean.
# NOTE: the imputer is fitted on the full dataset here; for a strictly
# leakage-free evaluation, fit it on the training split only (for example
# inside a scikit-learn Pipeline).
imputer = SimpleImputer(strategy='mean')
df_numeric = df.select_dtypes(include=[np.number])
df[df_numeric.columns] = imputer.fit_transform(df_numeric)

# Encode categorical variables as integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# original categories can be recovered later with
# label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Separate the features from the label
X = df.drop('target', axis=1)
y = df['target']

# Split data BEFORE scaling so the test rows never influence the scaler
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: learn mean/std from the training split only, then apply the
# same transformation to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training-set statistics
X_scaled = scaler.transform(X)

# --- Classification Models ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Trains three classifiers on X_train / y_train and compares their accuracy on
# the held-out X_test / y_test produced by the preprocessing step.

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Classification Report:")
print(classification_report(y_test, rf_pred))

# Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Support Vector Machine
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Compare models: map each model's display name to its test-set accuracy
models = {
    'Random Forest': accuracy_score(y_test, rf_pred),
    'Logistic Regression': accuracy_score(y_test, lr_pred),
    'SVM': accuracy_score(y_test, svm_pred),
}
for model, accuracy in models.items():
    print(f"{model}: {accuracy:.4f}")

# --- Regression Models ---
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Fits two regressors on X_train / y_train, reports MSE and R² on the test
# split, and plots predicted against actual values for each model.
# NOTE: lr_pred and rf_pred are rebound here, replacing the classification
# predictions of the same names from the previous section.

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Random Forest Regression
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Evaluate models
print("Linear Regression:")
print(f"MSE: {mean_squared_error(y_test, lr_pred):.4f}")
print(f"R²: {r2_score(y_test, lr_pred):.4f}")
print("\nRandom Forest Regression:")
print(f"MSE: {mean_squared_error(y_test, rf_pred):.4f}")
print(f"R²: {r2_score(y_test, rf_pred):.4f}")

# Visualization: one predicted-vs-actual scatter plot per model, side by side.
# The dashed red diagonal marks where predicted == actual.
plt.figure(figsize=(12, 5))
panels = [('Linear Regression', lr_pred), ('Random Forest', rf_pred)]
for position, (title, predictions) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(y_test, predictions, alpha=0.7)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(title)
plt.tight_layout()
plt.show()

# --- Cross-Validation and Hyperparameter Tuning ---
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score  # needed for the final test-set check

# Estimates a random forest's accuracy with 5-fold cross-validation, then
# searches a hyperparameter grid and scores the best model on the test split.

# Cross-validation
# NOTE: `rf` is rebound here; it previously named the RandomForestRegressor.
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf, X_train, y_train, cv=5)
print(f"Cross-validation scores: {scores}")
# The "+/-" figure is twice the standard deviation of the fold scores
print(f"Mean CV score: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Hyperparameter tuning with GridSearch
# 3 * 4 * 3 * 3 = 108 parameter combinations, each evaluated with 5 folds
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # run the fits in parallel on all available processors
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Use best model: evaluate the winning estimator on the held-out test split
best_model = grid_search.best_estimator_
best_pred = best_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, best_pred))