Access code

Necessary imports

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, IsolationForest, VotingClassifier
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             precision_score, recall_score, f1_score,
                             roc_auc_score, roc_curve, precision_recall_curve)
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from user_agents import parse
from urllib.parse import urlparse, parse_qs

import matplotlib.pyplot as plt

Load CSV file into a DataFrame

file_path = r'D:\codes\logs\Web server access logs\Web server access logs.csv'  # raw string so backslashes are not treated as escape sequences
df = pd.read_csv(file_path)

Explore the Data

print(df.info())
print(df.head())

Data Preprocessing

df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%b/%Y:%H:%M:%S %z')

Feature Engineering

df['date'] = df['Timestamp'].dt.date
df['hour'] = df['Timestamp'].dt.hour

Parse URL components

df['path'] = df['URL'].apply(lambda x: urlparse(x).path)
df['query_params'] = df['URL'].apply(lambda x: parse_qs(urlparse(x).query))
df['domain'] = df['URL'].apply(lambda x: urlparse(x).netloc)
df['subdomain'] = df['URL'].apply(lambda x: urlparse(x).hostname.split('.')[0] if urlparse(x).hostname else '')
df['file_extension'] = df['URL'].apply(lambda x: x.split('.')[-1] if '.' in x else '')
df['num_slashes'] = df['URL'].apply(lambda x: x.count('/'))

Parse UserAgent for device, browser, and OS information

df['device_type'] = df['UserAgent'].apply(lambda x: parse(x).device.family)
df['browser'] = df['UserAgent'].apply(lambda x: parse(x).browser.family)
df['os'] = df['UserAgent'].apply(lambda x: parse(x).os.family)

Extract day of the week and weekend flag

df['day_of_week'] = df['Timestamp'].dt.day_name()
df['is_weekend'] = df['Timestamp'].dt.weekday // 5

Extract subdirectories from path

df['subdirectories'] = df['path'].apply(lambda x: x.split('/')[1:] if x.startswith('/') else [])
df['url_length'] = df['URL'].apply(len)
df['num_query_params'] = df['query_params'].apply(len)

print(df.info())

Session Information Feature Extraction

Assuming 'IP' column exists for the IP address and a session timeout of 30 minutes

session_timeout = pd.Timedelta(minutes=30)
df = df.sort_values(['IP', 'Timestamp'])
df['session_id'] = (df.groupby('IP')['Timestamp'].diff() > session_timeout).cumsum()
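To make the diff/cumsum trick above easier to follow, here is a minimal, self-contained sketch on a toy frame (the column names mirror the ones used here; the data is purely illustrative):

toy = pd.DataFrame({
    'IP': ['1.1.1.1'] * 3 + ['2.2.2.2'] * 2,
    'Timestamp': pd.to_datetime(['2024-01-01 10:00', '2024-01-01 10:05', '2024-01-01 11:00',
                                 '2024-01-01 10:00', '2024-01-01 10:10']),
})
toy = toy.sort_values(['IP', 'Timestamp'])
# A gap larger than the timeout flips the comparison to True, and cumsum turns each flip into a new ID
toy['session_id'] = (toy.groupby('IP')['Timestamp'].diff() > pd.Timedelta(minutes=30)).cumsum()
print(toy)

Note that the IDs are only unique together with the IP (the cumulative sum runs over the whole frame), which is why the groupbys below always use the ('IP', 'session_id') pair.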

Calculate session duration and requests per session

session_info = df.groupby(['IP', 'session_id']).agg(
    session_start=('Timestamp', 'min'),
    session_end=('Timestamp', 'max'),
    requests_per_session=('Timestamp', 'count'),
)

session_info['session_duration'] = (session_info['session_end'] - session_info['session_start']).dt.total_seconds()

requests_per_session is already produced by the aggregation above (one row per IP/session pair), so no separate recomputation is needed here.

Merge requests per session back to the original DataFrame

df = df.merge(session_info[['requests_per_session']], on=['IP', 'session_id'], how='left')

Merge session info back to the original DataFrame

df = df.merge(session_info[['session_duration']], on=['IP', 'session_id'], how='left')

Count repeated requests within a short time frame

df.sort_values(['IP', 'Timestamp'], inplace=True)
df['prev_timestamp'] = df.groupby('IP')['Timestamp'].shift(1)
df['time_diff'] = (df['Timestamp'] - df['prev_timestamp']).dt.total_seconds()
df['is_repeated'] = (df['path'] == df['path'].shift(1)) & (df['IP'] == df['IP'].shift(1)) & (df['time_diff'] < 60)  # 60-second threshold
df['is_repeated'] = df['is_repeated'].astype(int)

Encode HTTP status codes - focusing on 409 Conflict as an example

df['status_409'] = (df['Status'] == 409).astype(int)
print(df.info())

Temporal Features

df['hour_of_day'] = df['Timestamp'].dt.hour
df['day_of_week'] = df['Timestamp'].dt.dayofweek
df['month'] = df['Timestamp'].dt.month

Request and Response Features

df['status_4xx'] = df['Status'].apply(lambda x: 1 if str(x).startswith('4') else 0)
df['status_5xx'] = df['Status'].apply(lambda x: 1 if str(x).startswith('5') else 0)
df['num_query_params'] = df['query_params'].apply(len)

Session-based Features

df['requests_per_session'] = df.groupby(['IP', 'session_id'])['IP'].transform('count')
df['session_duration'] = (df.groupby(['IP', 'session_id'])['Timestamp'].transform('max')
                          - df.groupby(['IP', 'session_id'])['Timestamp'].transform('min')).dt.total_seconds()

Device and User Agent Features

df['device_type'] = df['UserAgent'].apply(lambda x: parse(x).device.family)
df['browser'] = df['UserAgent'].apply(lambda x: parse(x).browser.family)
df['os'] = df['UserAgent'].apply(lambda x: parse(x).os.family)
print(df.info())

Time Series Analysis

Moving averages

df['requests_ma_10'] = df['requests_per_session'].rolling(window=10).mean()
df['session_duration_ma_10'] = df['session_duration'].rolling(window=10).mean()

Feature Interactions

df['hour_day_interaction'] = df['hour_of_day'] * df['day_of_week']

Aggregated Features

df['avg_requests_per_ip'] = df.groupby('IP')['requests_per_session'].transform('mean')
df['std_session_duration_per_ip'] = df.groupby('IP')['session_duration'].transform('std')

Statistical Features

df['median_requests_per_ip'] = df.groupby('IP')['requests_per_session'].transform('median')
df['mean_session_duration_per_ip'] = df.groupby('IP')['session_duration'].transform('mean')

Feature Crosses

df['device_browser_combination'] = df['device_type'] + '_' + df['browser']

Erroneous Data Handling

Assuming 'session_duration' should not be negative

df = df[df['session_duration'] >= 0]

print(df.info())

Entropy of request types

from scipy.stats import entropy

df['method_entropy'] = df.groupby(['session_id'])['Method'].transform(lambda x: entropy(x.value_counts(normalize=True)))
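As a quick illustration of what this entropy feature captures, the toy snippet below (values are illustrative, not taken from the log) shows that a session dominated by a single method scores lower than a mixed one:

from scipy.stats import entropy

mixed_session = pd.Series(['GET', 'POST', 'PUT', 'DELETE'])
skewed_session = pd.Series(['GET', 'GET', 'GET', 'POST'])
print(entropy(mixed_session.value_counts(normalize=True)))   # ~1.39, the maximum for four equally likely methods
print(entropy(skewed_session.value_counts(normalize=True)))  # ~0.56, mostly one method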

Rate of requests

df['requests_per_minute'] = df.groupby(['session_id', df['Timestamp'].dt.minute])['URL'].transform('count')

Cyclical Encoding

df['hour_sin'] = np.sin(df['hour_of_day'] * (2. * np.pi / 24))
df['hour_cos'] = np.cos(df['hour_of_day'] * (2. * np.pi / 24))
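The point of the sin/cos pair is that hours on either side of midnight end up close together in feature space, which the raw hour value cannot express. A small sketch of that effect (the helper function is only for illustration):

def hour_to_xy(h):
    # Map an hour onto the unit circle, exactly as the two columns above do
    return np.array([np.sin(h * 2 * np.pi / 24), np.cos(h * 2 * np.pi / 24)])

print(np.linalg.norm(hour_to_xy(23) - hour_to_xy(0)))   # ~0.26: 23:00 and 00:00 are neighbours
print(np.linalg.norm(hour_to_xy(12) - hour_to_xy(0)))   # 2.0: noon and midnight are maximally far apart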

Interaction Features

Combination of Different Feature Types

df['status_hour_interaction'] = df['status_409'] * df['hour_of_day']
df['requests_minute_interaction'] = df['requests_per_minute'] * df['hour_of_day']

Frequency Analysis of Endpoints

df['endpoint_frequency'] = df['path'].map(df['path'].value_counts())

Multidimensional Time Series Features

Window Functions

df['rolling_avg_requests'] = df['requests_per_session'].rolling(window=10).mean()
df['rolling_var_requests'] = df['requests_per_session'].rolling(window=10).var()

Exponential Weighted Moving Average (EWMA)

df['ewma_requests'] = df['requests_per_session'].ewm(alpha=0.3).mean()

Aggregate statistics of bytes sent per IP address

bytes_sent_stats_per_ip = df.groupby('IP')['Bytes Sent'].agg(['mean', 'median', 'std', 'min', 'max']).reset_index()

Moving average of bytes sent

df['bytes_sent_ma'] = df['Bytes Sent'].rolling(window=10).mean()

Exponential weighted moving average of bytes sent

df['bytes_sent_ewma'] = df['Bytes Sent'].ewm(alpha=0.3).mean()

Aggregate statistics of bytes sent per session

bytes_sent_stats_per_session = df.groupby('session_id')['Bytes Sent'].agg(['sum', 'mean', 'median']).reset_index()

import numpy as np

Deeper URL Analysis

df['url_depth'] = df['path'].apply(lambda x: len(x.strip('/').split('/')))
df['avg_query_value_length'] = df['query_params'].apply(lambda x: np.mean([len(v[0]) for v in x.values()]) if x else 0)
df['unique_query_keys'] = df['query_params'].apply(lambda x: len(set(x.keys())))

Session Analysis

df['unique_pages_visited_per_session'] = df.groupby(['session_id'])['path'].transform(lambda x: x.nunique())
df['avg_time_per_page'] = df.groupby(['session_id'])['time_diff'].transform(lambda x: x.mean())

Temporal Patterns

df['off_hours'] = df['hour'].apply(lambda x: 1 if x < 6 or x > 22 else 0)
df['unusual_request_rate'] = df.groupby('IP')['hour'].transform(lambda x: (x - x.mean()) / x.std())

Ensure NA values are handled

df.fillna({'avg_query_value_length': 0, 'avg_time_per_page': 0, 'unusual_request_rate': 0}, inplace=True)

Manual labeling based on failed (4xx/5xx) requests

df['Anomaly'] = (df['status_4xx'] | df['status_5xx']).astype(int)
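Before modelling, it is worth checking how imbalanced this label is, since that is what motivates the resampling applied later on:

print(df['Anomaly'].value_counts())
print(df['Anomaly'].value_counts(normalize=True))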

Drop redundant or unnecessary columns

columns_to_drop = ['URL', 'UserAgent', 'path', 'query_params', 'day_of_week', 'is_weekend', 'os',
                   'browser', 'device_type', 'file_extension', 'hour', 'date', 'IP', 'Timestamp',
                   'domain', 'session_id', 'subdirectories', 'User', 'Referrer',
                   'prev_timestamp', 'hour_of_day', 'month', 'subdomain', 'Method', 'Status',
                   'status_4xx', 'status_5xx']

df.drop(columns=columns_to_drop, inplace=True)

Data Cleaning

print("Missing values before handling:") print(df.isnull().sum())

Step 1: Identify different types of features

categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
boolean_features = [col for col in numerical_features if set(df[col].dropna().unique()).issubset({0, 1})]
numerical_features = [col for col in numerical_features if col not in boolean_features]

datetime_features = df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns.tolist()

Step 2: Impute missing values

Impute missing values for categorical features

for feature in categorical_features:
    df[feature] = df[feature].fillna(df[feature].mode()[0])

Impute missing values for numerical features

for column in numerical_features:
    df[column] = df[column].fillna(df[column].median())

Handling missing values in datetime columns

for feature in datetime_features:
    df[feature] = df[feature].ffill()  # fillna(method='ffill') is deprecated in recent pandas

Handling missing values in boolean columns

for column in boolean_features:
    df[column] = df[column].fillna(0)

Convert boolean columns to integers

for column in boolean_features:
    df[column] = df[column].astype(int)

print("DataFrame after dropping columns:") print(df.info())

print(df.isnull().sum())
print("DataFrame size:", df.shape)

Feature Selection and Dimensionality Reduction with PCA

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

numerical_features = df.select_dtypes(include=[np.number]).columns
numerical_features = [col for col in numerical_features if col not in ['Anomaly']]

import seaborn as sns
import matplotlib.pyplot as plt

Create a correlation matrix

correlation_matrix = df[numerical_features].corr(method='pearson') # You can also use 'spearman' if non-linear relationships are expected

Visualize the correlation matrix as a heatmap

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title('Feature Correlation Matrix')
plt.show()

Feature-Anomaly Correlation

feature_anomaly_corr = df[numerical_features].apply(lambda x: x.corr(df['Anomaly'], method='pearson'))

print("Correlation with 'Anomaly':") print(feature_anomaly_corr)

Impute missing values for numerical features

imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df[numerical_features])

Standardize the features

df_standardized = StandardScaler().fit_transform(df_imputed)

import matplotlib.pyplot as plt

pca = PCA().fit(df_standardized) # Assuming df_standardized is your feature matrix

Plot the cumulative sum of the explained variance ratio

plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Components')
plt.grid(True)
plt.show()

Apply PCA with the selected number of components

pca = PCA(n_components=35)
pca_result = pca.fit_transform(df_standardized)
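The 35 components above are hard-coded; the count can instead be derived from the cumulative explained variance plotted earlier. A minimal sketch, assuming a 95% variance threshold (the threshold is a modelling choice, not something dictated by the data):

full_pca = PCA().fit(df_standardized)
cumulative_variance = np.cumsum(full_pca.explained_variance_ratio_)
n_components_95 = int(np.argmax(cumulative_variance >= 0.95)) + 1
print("Components needed for 95% of the variance:", n_components_95)
# n_components_95 could then be passed to PCA(n_components=...) in place of the fixed 35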

Getting PCA loadings

feature_names = df[numerical_features].columns
pca_loadings_df = pd.DataFrame(pca.components_.T, index=feature_names,
                               columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])

print("PCA Loadings:") print(pca_loadings_df)

from keras.models import Sequential
from keras.layers import LSTM, Dense

X_train, X_test, y_train, y_test = train_test_split(pca_result, df['Anomaly'], test_size=0.2, random_state=42)

Import necessary libraries

from imblearn.over_sampling import SMOTE

Apply SMOTE to training data

smote = SMOTE(k_neighbors=8)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
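A quick check that the resampling actually balanced the two classes (and that the minority class had enough samples for k_neighbors=8) can look like this:

print("Class counts before SMOTE:", pd.Series(y_train).value_counts().to_dict())
print("Class counts after SMOTE: ", pd.Series(y_train_resampled).value_counts().to_dict())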

Isolation Forest for Anomaly Detection

iso_forest = IsolationForest(n_estimators=50, contamination=0.05, random_state=42)
iso_forest.fit(X_train_resampled)

y_pred_iso = iso_forest.predict(X_test)

accuracy_iso = accuracy_score(y_test, (y_pred_iso == -1).astype(int))
print("Isolation Forest Accuracy:", accuracy_iso)

from sklearn.inspection import permutation_importance

Assuming iso_forest is your trained Isolation Forest model and X_test is your test dataset

result = permutation_importance(iso_forest, X_test, y_test, n_repeats=10, random_state=42, scoring='accuracy')

Get the importance of each feature

feature_importances = result.importances_mean

Display the feature importances

for feature, importance in zip(feature_names, feature_importances):
    print(f"{feature}: {importance}")

LSTM for Sequence Data

X_train_lstm = np.array(X_train_resampled).reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_test_lstm = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.regularizers import l1_l2

model = Sequential()
model.add(LSTM(20, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]),
               kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))  # increased units, added L1/L2 regularization
model.add(Dropout(0.3))  # adding dropout
model.add(LSTM(5, kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_resampled, epochs=10, batch_size=64, validation_split=0.25)  # adjusted epochs, batch size, and validation split

Make predictions

y_pred_lstm_prob = model.predict(X_test_lstm)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)

Predict anomalies using Isolation Forest

y_pred_iso = iso_forest.predict(X_test)

Convert predictions to binary labels

y_pred_iso_binary = (y_pred_iso == -1).astype(int)

Combine predictions

y_pred_combined = np.logical_or(y_pred_lstm.flatten(), y_pred_iso_binary)

Evaluation Metrics

print("Combined Performance:") print(classification_report(y_test, y_pred_combined)) print("ROC AUC score:", roc_auc_score(y_test, y_pred_combined))

Plot ROC Curve

fpr, tpr, _ = roc_curve(y_test, y_pred_combined)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Combined ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Combined Model')
plt.legend()
plt.show()
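Because y_pred_combined contains hard 0/1 labels, the curve above only has a couple of distinct points. For a smoother diagnostic, the LSTM probability scores can be fed to roc_curve directly; a sketch of that alternative (it evaluates the LSTM alone, not the combined rule):

fpr_prob, tpr_prob, _ = roc_curve(y_test, y_pred_lstm_prob.ravel())
plt.figure(figsize=(10, 6))
plt.plot(fpr_prob, tpr_prob, label='LSTM (probability scores)')
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve from LSTM Probabilities')
plt.legend()
plt.show()
print("LSTM ROC AUC (probabilities):", roc_auc_score(y_test, y_pred_lstm_prob.ravel()))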

import numpy as np
import tensorflow as tf

Define a function to compute Integrated Gradients

def integrated_gradients(inputs, model, baseline=None, num_steps=50):
    if baseline is None:
        baseline = np.zeros_like(inputs)

    # Create a linear path from the baseline to the inputs
    alphas = np.linspace(0, 1, num_steps)

    # Initialize an empty list to store the gradients along the path
    integrated_grads = []

    # Convert inputs and baseline to TensorFlow tensors
    inputs_tf = tf.convert_to_tensor(inputs, dtype=tf.float32)
    baseline_tf = tf.convert_to_tensor(baseline, dtype=tf.float32)

    # Compute the gradient at each interpolation step along the path
    for alpha in alphas:
        interpolated_input = baseline_tf + alpha * (inputs_tf - baseline_tf)
        with tf.GradientTape() as tape:
            tape.watch(interpolated_input)
            predictions = model(interpolated_input)
        grads = tape.gradient(predictions, interpolated_input)
        integrated_grads.append(grads)

    # Average the gradients along the path to approximate the integral
    integrated_grads = np.array(integrated_grads)
    avg_grads = np.mean(integrated_grads, axis=0)
    integrated_gradients = (inputs_tf - baseline_tf) * avg_grads
    return integrated_gradients

Assuming X_test_lstm contains your test data

Compute Integrated Gradients for a single sample

sample_idx = 0  # index of the sample to explain
sample = X_test_lstm[sample_idx:sample_idx+1]
baseline = np.zeros_like(sample)  # a different baseline can be chosen if needed
ig = integrated_gradients(sample, model, baseline=baseline)

Visualize the impact of features on the prediction

feature_names = numerical_features # Use your feature names here

Check the lengths of feature_names and the feature importances array

print("Length of feature_names:", len(feature_names)) print("Length of feature importances:", len(np.abs(ig[0].numpy().mean(axis=0))))

plt.figure(figsize=(10, 6))
plt.bar(feature_names, np.abs(ig[0].numpy().mean(axis=0)))
plt.title('Integrated Gradients - Feature Importance')
plt.xlabel('Features')
plt.ylabel('Integrated Gradients')
plt.xticks(rotation=45, ha='right')
plt.show()

Assuming permutation importances and Integrated Gradients have been computed from the Isolation Forest and LSTM model, respectively

Normalize permutation importances

if_max, if_min = max(feature_importances), min(feature_importances)
normalized_if_importances = (feature_importances - if_min) / (if_max - if_min)

Normalize Integrated Gradients

ig_mean = np.mean(np.abs(ig[0].numpy()), axis=0)
ig_max, ig_min = max(ig_mean), min(ig_mean)
normalized_ig = (ig_mean - ig_min) / (ig_max - ig_min)

Combine importances

combined_importances = normalized_if_importances + normalized_ig

Visualize the combined feature importances

plt.figure(figsize=(10, 6))
plt.bar(feature_names, combined_importances)
plt.title('Combined Feature Importances')
plt.xlabel('Features')
plt.ylabel('Combined Importances')
plt.xticks(rotation=45, ha='right')
plt.show()

Zip feature names and combined importances together

importance_pairs = zip(feature_names, combined_importances)

Print feature names and importances

for feature, importance in importance_pairs:
    print(f'{feature}: {importance:.2f}')
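To make the ranking easier to scan, the same pairs can also be sorted by importance before printing; a small sketch:

for feature, importance in sorted(zip(feature_names, combined_importances), key=lambda pair: pair[1], reverse=True):
    print(f'{feature}: {importance:.2f}')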