ページ "Access code"
が削除されます。ご確認ください。
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, IsolationForest, VotingClassifier
from sklearn.cluster import KMeans
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from user_agents import parse
from urllib.parse import urlparse, parse_qs
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, precision_recall_curve,
                             precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
# Raw string so the Windows backslashes are not treated as escape sequences
file_path = r'D:\codes\logs\Web server access logs\Web server access logs.csv'
df = pd.read_csv(file_path)
print(df.info())
print(df.head())
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
df['date'] = df['Timestamp'].dt.date
df['hour'] = df['Timestamp'].dt.hour
df['path'] = df['URL'].apply(lambda x: urlparse(x).path)
df['query_params'] = df['URL'].apply(lambda x: parse_qs(urlparse(x).query))
df['domain'] = df['URL'].apply(lambda x: urlparse(x).netloc)
df['subdomain'] = df['URL'].apply(lambda x: urlparse(x).hostname.split('.')[0] if urlparse(x).hostname else '')
df['file_extension'] = df['URL'].apply(lambda x: x.split('.')[-1] if '.' in x else '')
df['num_slashes'] = df['URL'].apply(lambda x: x.count('/'))
df['device_type'] = df['UserAgent'].apply(lambda x: parse(x).device.family)
df['browser'] = df['UserAgent'].apply(lambda x: parse(x).browser.family)
df['os'] = df['UserAgent'].apply(lambda x: parse(x).os.family)
df['day_of_week'] = df['Timestamp'].dt.day_name()
df['is_weekend'] = df['Timestamp'].dt.weekday // 5
df['subdirectories'] = df['path'].apply(lambda x: x.split('/')[1:] if x.startswith('/') else [])
df['url_length'] = df['URL'].apply(len)
df['num_query_params'] = df['query_params'].apply(lambda x: len(x))
print(df.info())
# Sessionise requests: a gap of more than 30 minutes between requests from the same IP starts a new session
session_timeout = pd.Timedelta(minutes=30)
df = df.sort_values(['IP', 'Timestamp'])
df['session_id'] = (df.groupby('IP')['Timestamp'].diff() > session_timeout).cumsum()
session_info = df.groupby(['IP', 'session_id']).agg(
    session_start=('Timestamp', 'min'),
    session_end=('Timestamp', 'max'),
    requests_per_session=('Timestamp', 'count'))
session_info['session_duration'] = (session_info['session_end'] - session_info['session_start']).dt.total_seconds()
df = df.merge(session_info[['requests_per_session']], on=['IP', 'session_id'], how='left')
df = df.merge(session_info[['session_duration']], on=['IP', 'session_id'], how='left')
df.sort_values(['IP', 'Timestamp'], inplace=True)
df['prev_timestamp'] = df.groupby('IP')['Timestamp'].shift(1)
df['time_diff'] = (df['Timestamp'] - df['prev_timestamp']).dt.total_seconds()
df['is_repeated'] = (df['path'] == df['path'].shift(1)) & (df['IP'] == df['IP'].shift(1)) & (df['time_diff'] < 60)  # 60 seconds threshold
df['is_repeated'] = df['is_repeated'].astype(int)
df['status_409'] = (df['Status'] == 409).astype(int)
print(df.info())
df['hour_of_day'] = df['Timestamp'].dt.hour
df['day_of_week'] = df['Timestamp'].dt.dayofweek  # overwrites the earlier day name with a numeric weekday (0 = Monday)
df['month'] = df['Timestamp'].dt.month
df['status_4xx'] = df['Status'].apply(lambda x: 1 if str(x).startswith('4') else 0)
df['status_5xx'] = df['Status'].apply(lambda x: 1 if str(x).startswith('5') else 0)
df['num_query_params'] = df['query_params'].apply(lambda x: len(x))
df['requests_per_session'] = df.groupby(['IP', 'session_id'])['IP'].transform('count')
df['session_duration'] = (df.groupby(['IP', 'session_id'])['Timestamp'].transform('max')
                          - df.groupby(['IP', 'session_id'])['Timestamp'].transform('min')).dt.total_seconds()
# device_type, browser and os were already parsed from the user agent above
print(df.info())
df['requests_ma_10'] = df['requests_per_session'].rolling(window=10).mean()
df['session_duration_ma_10'] = df['session_duration'].rolling(window=10).mean()
df['hour_day_interaction'] = df['hour_of_day'] * df['day_of_week']
df['avg_requests_per_ip'] = df.groupby('IP')['requests_per_session'].transform('mean')
df['std_session_duration_per_ip'] = df.groupby('IP')['session_duration'].transform('std')
df['median_requests_per_ip'] = df.groupby('IP')['requests_per_session'].transform('median')
df['mean_session_duration_per_ip'] = df.groupby('IP')['session_duration'].transform('mean')
df['device_browser_combination'] = df['device_type'] + '_' + df['browser']
df = df[df['session_duration'] >= 0]
print(df.info())
from scipy.stats import entropy

# Shannon entropy of the HTTP methods used within each session
df['method_entropy'] = df.groupby(['session_id'])['Method'].transform(lambda x: entropy(x.value_counts(normalize=True)))
# Requests per clock minute within each session (floor the timestamp so e.g. 10:05 and 11:05 are not pooled)
df['requests_per_minute'] = df.groupby(['session_id', df['Timestamp'].dt.floor('min')])['URL'].transform('count')
df['hour_sin'] = np.sin(df['hour_of_day'] * (2. * np.pi / 24))
df['hour_cos'] = np.cos(df['hour_of_day'] * (2. * np.pi / 24))
df['status_hour_interaction'] = df['status_409'] * df['hour_of_day']
df['requests_minute_interaction'] = df['requests_per_minute'] * df['hour_of_day']
df['endpoint_frequency'] = df['path'].map(df['path'].value_counts())
df['rolling_avg_requests'] = df['requests_per_session'].rolling(window=10).mean()
df['rolling_var_requests'] = df['requests_per_session'].rolling(window=10).var()
df['ewma_requests'] = df['requests_per_session'].ewm(alpha=0.3).mean()
bytes_sent_stats_per_ip = df.groupby('IP')['Bytes Sent'].agg(['mean', 'median', 'std', 'min', 'max']).reset_index()
df['bytes_sent_ma'] = df['Bytes Sent'].rolling(window=10).mean()
df['bytes_sent_ewma'] = df['Bytes Sent'].ewm(alpha=0.3).mean()
bytes_sent_stats_per_session = df.groupby('session_id')['Bytes Sent'].agg(['sum', 'mean', 'median']).reset_index()
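# The per-IP and per-session 'Bytes Sent' aggregates above are not merged back into
# df anywhere in this script; a minimal sketch of how the per-IP statistics could be
# joined if desired (the 'ip_bytes_' prefix and df_with_ip_stats are illustrative,
# not part of the original code):
bytes_ip = bytes_sent_stats_per_ip.add_prefix('ip_bytes_').rename(columns={'ip_bytes_IP': 'IP'})
df_with_ip_stats = df.merge(bytes_ip, on='IP', how='left')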
df['url_depth'] = df['path'].apply(lambda x: len(x.strip('/').split('/')))
df['avg_query_value_length'] = df['query_params'].apply(lambda x: np.mean([len(v[0]) for v in x.values()]) if x else 0)
df['unique_query_keys'] = df['query_params'].apply(lambda x: len(set(x.keys())))
df['unique_pages_visited_per_session'] = df.groupby(['session_id'])['path'].transform(lambda x: x.nunique())
df['avg_time_per_page'] = df.groupby(['session_id'])['time_diff'].transform(lambda x: x.mean())
df['off_hours'] = df['hour'].apply(lambda x: 1 if x < 6 or x > 22 else 0)
df['unusual_request_rate'] = df.groupby('IP')['hour'].transform(lambda x: (x - x.mean()) / x.std())
df.fillna({'avg_query_value_length': 0, 'avg_time_per_page': 0, 'unusual_request_rate': 0}, inplace=True)
df['Anomaly'] = (df['status_4xx'] | df['status_5xx']).astype(int)
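# Quick check of the label balance (not in the original code): the 4xx/5xx-based label
# is expected to be heavily imbalanced, which is why SMOTE is applied before modelling below.
print(df['Anomaly'].value_counts(normalize=True))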
columns_to_drop = ['URL', 'UserAgent', 'path', 'query_params', 'day_of_week', 'is_weekend', 'os',
                   'browser', 'device_type', 'file_extension', 'hour', 'date', 'IP', 'Timestamp',
                   'domain', 'session_id', 'subdirectories', 'User', 'Referrer',
                   'prev_timestamp', 'hour_of_day', 'month', 'subdomain', 'Method', 'Status',
                   'status_4xx', 'status_5xx']
df.drop(columns=columns_to_drop, inplace=True)
print("Missing values before handling:") print(df.isnull().sum())
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
boolean_features = [col for col in numerical_features if set(df[col].dropna().unique()).issubset({0, 1})]
numerical_features = [col for col in numerical_features if col not in boolean_features]
datetime_features = df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns.tolist()
for feature in categorical_features:
    df[feature] = df[feature].fillna(df[feature].mode()[0])

for column in numerical_features:
    df[column] = df[column].fillna(df[column].median())

for feature in datetime_features:
    df[feature] = df[feature].ffill()

for column in boolean_features:
    df[column] = df[column].fillna(0).astype(int)
print("DataFrame after dropping columns:") print(df.info())
print(df.isnull().sum())
print("DataFrame size:", df.shape)
# Feature Selection and Dimensionality Reduction with PCA
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
numerical_features = df.select_dtypes(include=[np.number]).columns
numerical_features = [col for col in numerical_features if col not in ['Anomaly']]
import seaborn as sns
import matplotlib.pyplot as plt
correlation_matrix = df[numerical_features].corr(method='pearson') # You can also use 'spearman' if non-linear relationships are expected
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title('Feature Correlation Matrix')
plt.show()
feature_anomaly_corr = df[numerical_features].apply(lambda x: x.corr(df['Anomaly'], method='pearson'))
print("Correlation with 'Anomaly':") print(feature_anomaly_corr)
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df[numerical_features])
df_standardized = StandardScaler().fit_transform(df_imputed)
pca = PCA().fit(df_standardized) # Assuming df_standardized is your feature matrix
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Components')
plt.grid(True)
plt.show()
pca = PCA(n_components=35)
pca_result = pca.fit_transform(df_standardized)
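# Alternative sketch (an assumption, not the original choice of 35 components): PCA can
# instead pick the number of components that explains a target fraction of the variance.
pca_95 = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
pca_result_95 = pca_95.fit_transform(df_standardized)
print("Components needed for 95% variance:", pca_95.n_components_)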
feature_names = df[numerical_features].columns
pca_loadings_df = pd.DataFrame(pca.components_.T, index=feature_names,
                               columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])
print("PCA Loadings:") print(pca_loadings_df)
from keras.models import Sequential
from keras.layers import LSTM, Dense
X_train, X_test, y_train, y_test = train_test_split(pca_result, df['Anomaly'], test_size=0.2, random_state=42)
from imblearn.over_sampling import SMOTE
smote = SMOTE(k_neighbors=8)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
iso_forest = IsolationForest(n_estimators=50, contamination=0.05, random_state=42)
iso_forest.fit(X_train_resampled)
y_pred_iso = iso_forest.predict(X_test)
accuracy_iso = accuracy_score(y_test, (y_pred_iso == -1).astype(int))
print("Isolation Forest Accuracy:", accuracy_iso)
from sklearn.inspection import permutation_importance
# Note: IsolationForest.predict returns -1/1 while y_test is coded 0/1, so the built-in
# 'accuracy' scorer compares mismatched encodings here; see the custom-scorer sketch below.
result = permutation_importance(iso_forest, X_test, y_test, n_repeats=10, random_state=42, scoring='accuracy')
feature_importances = result.importances_mean
# The forest was fitted on the PCA-transformed matrix, so each importance value
# corresponds to a principal component rather than an original feature.
for i, importance in enumerate(feature_importances):
    print(f"PC{i+1}: {importance}")
X_train_lstm = np.array(X_train_resampled).reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_test_lstm = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.regularizers import l1_l2
model = Sequential()
model.add(LSTM(20, return_sequences=True,
               input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]),
               kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))  # Increased units, added L2 regularization
model.add(Dropout(0.3))  # Adding dropout
model.add(LSTM(5, kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_resampled, epochs=10, batch_size=64, validation_split=0.25)  # Adjusted epochs, batch size, and validation split
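# EarlyStopping is imported at the top but never used; a sketch of how it could be wired in
# (the patience value is an assumption). It would be passed to the fit call above via
# callbacks=[early_stop] so training stops once validation loss stops improving.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)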
y_pred_lstm_prob = model.predict(X_test_lstm)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)
y_pred_iso = iso_forest.predict(X_test)
y_pred_iso_binary = (y_pred_iso == -1).astype(int)
y_pred_combined = np.logical_or(y_pred_lstm.flatten(), y_pred_iso_binary)
print("Combined Performance:") print(classification_report(y_test, y_pred_combined)) print("ROC AUC score:", roc_auc_score(y_test, y_pred_combined))
fpr, tpr, _ = roc_curve(y_test, y_pred_combined)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Combined ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Combined Model')
plt.legend()
plt.show()
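# Optional sketch (not part of the original pipeline): instead of the fixed 0.5 cut-off used
# for the LSTM above, a decision threshold could be chosen from the precision-recall curve,
# e.g. by maximising F1 (ideally on a validation split rather than on the test set).
precisions, recalls, pr_thresholds = precision_recall_curve(y_test, y_pred_lstm_prob.flatten())
f1_scores = 2 * precisions[:-1] * recalls[:-1] / (precisions[:-1] + recalls[:-1] + 1e-9)
best_threshold = pr_thresholds[np.argmax(f1_scores)]
print("Candidate LSTM decision threshold:", best_threshold)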
import numpy as np
import tensorflow as tf
def integrated_gradients(inputs, model, baseline=None, num_steps=50):
    if baseline is None:
        baseline = np.zeros_like(inputs)
    # Create a linear path from baseline to the inputs
    alphas = np.linspace(0, 1, num_steps)
    # Collect the gradients computed along the path
    integrated_grads = []
    # Convert inputs and baseline to TensorFlow tensors
    inputs_tf = tf.convert_to_tensor(inputs, dtype=tf.float32)
    baseline_tf = tf.convert_to_tensor(baseline, dtype=tf.float32)
    # Compute gradients at each interpolation step along the path
    for alpha in alphas:
        interpolated_input = baseline_tf + alpha * (inputs_tf - baseline_tf)
        with tf.GradientTape() as tape:
            tape.watch(interpolated_input)
            predictions = model(interpolated_input)
        grads = tape.gradient(predictions, interpolated_input)
        integrated_grads.append(grads)
    # Approximate the path integral by averaging the gradients and scaling by the input difference
    avg_grads = np.mean(np.array(integrated_grads), axis=0)
    attributions = (inputs_tf - baseline_tf) * avg_grads
    return attributions
sample_idx = 0  # Index of the sample to explain
sample = X_test_lstm[sample_idx:sample_idx+1]
baseline = np.zeros_like(sample)  # A different baseline can be chosen if needed
ig = integrated_gradients(sample, model, baseline=baseline)
# The LSTM was trained on the PCA-transformed inputs, so label the attributions by principal component
feature_names = [f'PC{i+1}' for i in range(pca_result.shape[1])]
print("Length of feature_names:", len(feature_names)) print("Length of feature importances:", len(np.abs(ig[0].numpy().mean(axis=0))))
plt.figure(figsize=(10, 6))
plt.bar(feature_names, np.abs(ig[0].numpy().mean(axis=0)))
plt.title('Integrated Gradients - Feature Importance')
plt.xlabel('Features')
plt.ylabel('Integrated Gradients')
plt.xticks(rotation=45, ha='right')
plt.show()
if_max, if_min = max(feature_importances), min(feature_importances)
normalized_if_importances = (feature_importances - if_min) / (if_max - if_min)
ig_mean = np.mean(np.abs(ig[0].numpy()), axis=0)
ig_max, ig_min = max(ig_mean), min(ig_mean)
normalized_ig = (ig_mean - ig_min) / (ig_max - ig_min)
combined_importances = normalized_if_importances + normalized_ig
plt.figure(figsize=(10, 6))
plt.bar(feature_names, combined_importances)
plt.title('Combined Feature Importances')
plt.xlabel('Features')
plt.ylabel('Combined Importances')
plt.xticks(rotation=45, ha='right')
plt.show()
importance_pairs = zip(feature_names, combined_importances)
for feature, importance in importance_pairs:
    print(f'{feature}: {importance:.2f}')
ページ "Access code"
が削除されます。ご確認ください。