import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from keras.models import Model
from keras.layers import LSTM
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import (silhouette_score, accuracy_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, precision_recall_curve, precision_score,
                             recall_score, f1_score, make_scorer)
from mlxtend.frequent_patterns import apriori, fpmax, association_rules
from imblearn.over_sampling import ADASYN
file_path = r'D:/codes/logs/Nextcloud Admin Audit Logs/Nextcloud Admin Audit Logs.csv'
df = pd.read_csv(file_path)

print(df.info())
print(df.head())

print("Missing values before handling:")
print(df.isnull().sum())
df.dropna(inplace=True)
print("Missing values after handling:") print(df.isnull().sum()) print("DataFrame size:", df.shape)
df.dropna(subset=['remoteAddr', 'method'], inplace=True)
df['time'] = pd.to_datetime(df['time'], errors='coerce', utc=True)
df['date'] = df['time'].dt.date
df['hour'] = df['time'].dt.hour
df['day_of_week'] = df['time'].dt.dayofweek

df['is_business_hour'] = ((df['hour'] >= 9) & (df['hour'] <= 17)).astype(int)
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# Extract the quoted user name from login-related audit messages.
df['login_attempt'] = df['message'].apply(
    lambda x: re.search(r'Login attempt: "(.*?)"', str(x)).group(1)
    if re.search(r'Login attempt: "(.*?)"', str(x)) else None)
df['login_successful'] = df['message'].apply(
    lambda x: re.search(r'Login successful: "(.*?)"', str(x)).group(1)
    if re.search(r'Login successful: "(.*?)"', str(x)) else None)
df['failed_attempt'] = df['login_attempt'].notnull().astype(int)

# Decompose the request URL into path, directory, file name and extension.
df['file_path'] = df['url'].apply(lambda x: re.sub(r'\?.*', '', x) if isinstance(x, str) else None)
df['directory'] = df['file_path'].apply(
    lambda x: re.search(r'/(.*?)/', str(x)).group(1) if re.search(r'/(.*?)/', str(x)) else None)
df['file_name'] = df['file_path'].apply(
    lambda x: re.search(r'/([^/]+)$', str(x)).group(1) if re.search(r'/([^/]+)$', str(x)) else None)
df['file_extension'] = df['file_name'].apply(
    lambda x: re.search(r'\.(.*?)$', str(x)).group(1) if re.search(r'\.(.*?)$', str(x)) else None)
df['file_type'] = df['url'].apply(
    lambda x: re.search(r'\.(.*?)$', str(x)).group(1) if re.search(r'\.(.*?)$', str(x)) else None)
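# Optional sanity check (not part of the original pipeline): the sample message and URL
# below are hypothetical and only illustrate what the extraction regexes above return.
_sample_message = 'Login attempt: "alice"'
_sample_url = '/remote.php/dav/files/alice/report.pdf?download=1'
print(re.search(r'Login attempt: "(.*?)"', _sample_message).group(1))  # -> 'alice'
print(re.sub(r'\?.*', '', _sample_url))                                # -> URL with the query string stripped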
def create_user_profile(row):
if row['failed_attempt'] == 1:
return 'Failed Login Attempt'
else:
return 'Successful Login'
df['user_profile'] = df.apply(create_user_profile, axis=1)
df.sort_values(by=['user', 'time'], inplace=True)
df['time_diff'] = df.groupby('user')['time'].diff().dt.total_seconds() / 60  # gap to the previous event, in minutes
# NOTE: time_diff is in minutes, so the 1800 cutoff below corresponds to a 30-hour gap;
# use 30 here instead if a 30-minute session timeout is intended.
df['new_session'] = (df['time_diff'] > 1800).astype(int)
df['session_id'] = df.groupby('user')['new_session'].cumsum()
df['session_time_diff'] = df['time_diff'].where(df['new_session'] == 0, other=0)
df['session_duration'] = df.groupby(['user', 'session_id'])['time'].transform(lambda x: (x.max() - x.min()).total_seconds())
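# Optional illustration (hypothetical values, safe to remove): how the gap-based rule above
# splits one user's events into sessions, shown here with a 30-minute cutoff for readability.
_toy_times = pd.Series(pd.to_datetime(['2024-01-01 10:00', '2024-01-01 10:10',
                                       '2024-01-01 12:00', '2024-01-01 12:05'], utc=True))
_toy_gap_min = _toy_times.diff().dt.total_seconds() / 60
print((_toy_gap_min > 30).cumsum())  # session ids 0, 0, 1, 1 -- a new session after the 110-minute gap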
df['previous_login'] = df.sort_values(by=['user', 'time']).groupby('user')['time'].shift(1)
# Rolling statistics over the last 24*60 rows per user (a row-count window, not a time window).
# reset_index(level=0, drop=True) keeps the original row index so the result aligns with df,
# and min_periods=1 is added so users with fewer rows than the window still get a value.
df['login_attempts_rolling'] = (df.groupby('user')['failed_attempt']
                                  .rolling(window=24*60, min_periods=1).sum()
                                  .reset_index(level=0, drop=True))
df['moving_avg_failed_attempts'] = (df.groupby('user')['failed_attempt']
                                      .rolling(window=24*60, min_periods=1).mean()
                                      .reset_index(level=0, drop=True))

method_counts = df['method'].value_counts()
put_operations = df[df['method'] == 'PUT']
other_methods = df[(df['method'] != 'PUT') & (df['method'] != 'GET')]  # excluding GET requests
df['session_duration_trend'] = df.groupby('user')['session_duration'].transform(lambda x: x.diff().rolling(window=3).mean())
df['session_duration_percentile_rank'] = df.groupby('user')['session_duration'].transform(lambda x: x.rank(pct=True))
df['directory_access_count'] = df.groupby(['user', 'directory'])['time'].transform('count')
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
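# Optional check (illustrative only): with the sin/cos encoding above, hour 23 and hour 0
# land on nearly the same point of the unit circle, unlike the raw 0-23 integer.
for _h in (0, 23):
    print(_h, round(np.sin(2 * np.pi * _h / 24), 3), round(np.cos(2 * np.pi * _h / 24), 3))
# 0 -> (0.0, 1.0); 23 -> (-0.259, 0.966)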
df['session_duration_mean'] = df.groupby('user')['session_duration'].transform('mean')
df['session_duration_median'] = df.groupby('user')['session_duration'].transform('median')
df['session_duration_variance'] = df.groupby('user')['session_duration'].transform('var')
df['avg_session_duration_by_profile_directory'] = df.groupby(['user_profile', 'directory'])['session_duration'].transform('mean')
df['sequence_encoding'] = df.groupby('user')['directory'].transform(lambda x: pd.factorize(x)[0])
df['time_interval_between_requests'] = df.groupby('user')['time'].diff().dt.total_seconds()
df['session_request_count'] = df.groupby(['user', 'session_id'])['time'].transform('count')
df['login_attempt_session'] = (df['login_attempt'].notnull() & (df['new_session'] == 0)).astype(int)
df['file_access_session'] = (df['file_path'].notnull() & (df['new_session'] == 0)).astype(int)
df['action_type'] = df['method'].apply(lambda x: 'Login Attempt' if x == 'POST' else 'File Access')
df['file_or_directory'] = df['url'].apply(
    lambda x: x.split('/')[-2] if isinstance(x, str) and len(x.split('/')) > 1 else None)
df['request_method'] = df['method']

df['login_attempt_and_file_access'] = ((df['method'] == 'POST') & (df['file_name'].notnull())).astype(int)
df['file_access_and_method_PUT'] = ((df['method'] == 'PUT') & (df['file_name'].notnull())).astype(int)

login_threshold = 10  # example threshold for logins within the rolling window
df['excessive_logins'] = (df['login_attempts_rolling'] > login_threshold).astype(int)

ip_freq = df.groupby(['remoteAddr', 'date']).size()
threshold = 25000  # define your own threshold based on context
high_freq_ips = ip_freq[ip_freq > threshold].index.to_list()
df['high_freq_access'] = df.apply(lambda x: (x['remoteAddr'], x['date']) in high_freq_ips, axis=1).astype(int)
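# Optional, roughly equivalent vectorized alternative (a sketch, not the original approach):
# mapping per-(IP, date) counts back with groupby().transform('size') avoids the row-wise apply.
# df['high_freq_access'] = (df.groupby(['remoteAddr', 'date'])['time']
#                             .transform('size') > threshold).astype(int)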
df['Anomaly'] = df['failed_attempt']
print("Missing values before handling:") print(df.isnull().sum())
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist() numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist() boolean_features = [col for col in numerical_features if set(df[col].dropna().unique()).issubset({0, 1})] numerical_features = [col for col in numerical_features if col not in boolean_features]
datetime_features = df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns.tolist()
for feature in categorical_features:
df[feature] = df[feature].fillna(df[feature].mode()[0])
for column in numerical_features:
df[column] = df[column].fillna(df[column].median())
for feature in datetime_features:
    df[feature] = df[feature].ffill()
for column in boolean_features:
df[column] = df[column].fillna(0)
for column in boolean_features:
df[column] = df[column].astype(int)
print("Missing values after handling:") print(df.isnull().sum()) columns_to_drop = ['userAgent', 'version','reqId','time' , 'remoteAddr', 'user','app','level', 'login_successful', 'hour' , 'date','url' , 'file_name','session_id'
,'day_of_week','file_extension','is_business_hour','file_path','directory','user_profile','new_session','file_type','failed_attempt','action_type',
'request_method','method','message','previous_login' ,'session_duration','previous_login' ]
df = df.drop(columns=columns_to_drop, errors='ignore')
print("DataFrame after dropping columns:") print(df.info())
print(df.isnull().sum()) print("DataFrame size:", df.shape)
# Feature Selection and Dimensionality Reduction with PCA
# (numpy, pandas, PCA, StandardScaler, SimpleImputer, IsolationForest, train_test_split,
#  the sklearn metrics, seaborn and matplotlib are already imported above)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

numerical_features = df.select_dtypes(include=[np.number]).columns
numerical_features = [col for col in numerical_features if col not in ['Anomaly']]
correlation_matrix = df[numerical_features].corr(method='pearson') # You can also use 'spearman' if non-linear relationships are expected
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title('Feature Correlation Matrix')
plt.show()
feature_anomaly_corr = df[numerical_features].apply(lambda x: x.corr(df['Anomaly'], method='pearson'))
print("Correlation with 'Anomaly':") print(feature_anomaly_corr)
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df[numerical_features])
df_standardized = StandardScaler().fit_transform(df_imputed)
import matplotlib.pyplot as plt
pca = PCA().fit(df_standardized) # Assuming df_standardized is your feature matrix
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Components')
plt.grid(True)
plt.show()
pca = PCA(n_components=25)
pca_result = pca.fit_transform(df_standardized)
feature_names = df[numerical_features].columns
pca_loadings_df = pd.DataFrame(pca.components_.T, index=feature_names,
                               columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])
print("PCA Loadings:") print(pca_loadings_df)
from keras.models import Sequential
from keras.layers import LSTM, Dense
X_train, X_test, y_train, y_test = train_test_split(pca_result, df['Anomaly'], test_size=0.2, random_state=42)
from imblearn.over_sampling import SMOTE
smote = SMOTE(k_neighbors=7)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
iso_forest = IsolationForest(n_estimators=50, contamination=0.05, random_state=42)
iso_forest.fit(X_train_resampled)
y_pred_iso = iso_forest.predict(X_test)
accuracy_iso = accuracy_score(y_test, (y_pred_iso == -1).astype(int))
print("Isolation Forest Accuracy:", accuracy_iso)
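# Optional, threshold-free view of the Isolation Forest (a sketch, not part of the original
# evaluation): decision_function returns a continuous score where lower means more anomalous,
# so its negation can be scored directly with ROC AUC against the 0/1 labels.
iso_scores = -iso_forest.decision_function(X_test)
print("Isolation Forest ROC AUC (score-based):", roc_auc_score(y_test, iso_scores))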
from sklearn.inspection import permutation_importance
result = permutation_importance(iso_forest, X_test, y_test, n_repeats=10, random_state=42, scoring='accuracy')
feature_importances = result.importances_mean
# NOTE: the importances below are computed on the PCA components fed to the model, so
# pairing them with the original feature names is only indicative when the lengths match.
for feature, importance in zip(feature_names, feature_importances):
    print(f"{feature}: {importance}")
X_train_lstm = np.array(X_train_resampled).reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_test_lstm = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.regularizers import l1_l2
model = Sequential()
model.add(LSTM(10, return_sequences=True,
               input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]),
               kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))  # L1/L2 regularization on the first LSTM layer
model.add(Dropout(0.2))  # dropout between the LSTM layers
model.add(LSTM(5, kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_resampled, epochs=10, batch_size=64, validation_split=0.25)
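# Optional refinement (a sketch, commented out so the 10-epoch run above stays as-is):
# early stopping on validation loss is a common way to pick the epoch count automatically.
# from keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# model.fit(X_train_lstm, y_train_resampled, epochs=50, batch_size=64,
#           validation_split=0.25, callbacks=[early_stop])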
y_pred_lstm_prob = model.predict(X_test_lstm)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)
y_pred_iso = iso_forest.predict(X_test)
y_pred_iso_binary = (y_pred_iso == -1).astype(int)
y_pred_combined = np.logical_or(y_pred_lstm.flatten(), y_pred_iso_binary)
print("Combined Performance:") print(classification_report(y_test, y_pred_combined)) print("ROC AUC score:", roc_auc_score(y_test, y_pred_combined))
fpr, tpr, _ = roc_curve(y_test, y_pred_combined)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Combined ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Combined Model')
plt.legend()
plt.show()
import tensorflow as tf  # numpy is already imported above
def integrated_gradients(inputs, model, baseline=None, num_steps=50):
    if baseline is None:
        baseline = np.zeros_like(inputs)
    # Create a linear path from the baseline to the inputs
    alphas = np.linspace(0, 1, num_steps)
    # Collect the gradients computed at each interpolation step
    integrated_grads = []
    # Convert inputs and baseline to TensorFlow tensors
    inputs_tf = tf.convert_to_tensor(inputs, dtype=tf.float32)
    baseline_tf = tf.convert_to_tensor(baseline, dtype=tf.float32)
    # Compute gradients along the path
    for alpha in alphas:
        interpolated_input = baseline_tf + alpha * (inputs_tf - baseline_tf)
        with tf.GradientTape() as tape:
            tape.watch(interpolated_input)
            predictions = model(interpolated_input)
        grads = tape.gradient(predictions, interpolated_input)
        integrated_grads.append(grads)
    # Approximate the path integral by averaging the gradients (a Riemann-sum approximation)
    integrated_grads = np.array(integrated_grads)
    avg_grads = np.mean(integrated_grads, axis=0)
    attributions = (inputs_tf - baseline_tf) * avg_grads
    return attributions
sample_idx = 0  # choose the index of the sample to explain
sample = X_test_lstm[sample_idx:sample_idx + 1]
baseline = np.zeros_like(sample)  # a different baseline can be chosen if needed
ig = integrated_gradients(sample, model, baseline=baseline)
# NOTE: the LSTM operates on PCA components, so these original feature names only line up
# one-to-one when the number of components equals the number of numerical features.
feature_names = numerical_features  # use your feature names here
print("Length of feature_names:", len(feature_names)) print("Length of feature importances:", len(np.abs(ig[0].numpy().mean(axis=0))))
plt.figure(figsize=(10, 6))
plt.bar(feature_names, np.abs(ig[0].numpy().mean(axis=0)))
plt.title('Integrated Gradients - Feature Importance')
plt.xlabel('Features')
plt.ylabel('Integrated Gradients')
plt.xticks(rotation=45, ha='right')
plt.show()
if_max, if_min = max(feature_importances), min(feature_importances)
normalized_if_importances = (feature_importances - if_min) / (if_max - if_min)
ig_mean = np.mean(np.abs(ig[0].numpy()), axis=0)
ig_max, ig_min = max(ig_mean), min(ig_mean)
normalized_ig = (ig_mean - ig_min) / (ig_max - ig_min)
combined_importances = normalized_if_importances + normalized_ig
plt.figure(figsize=(10, 6))
plt.bar(feature_names, combined_importances)
plt.title('Combined Feature Importances')
plt.xlabel('Features')
plt.ylabel('Combined Importances')
plt.xticks(rotation=45, ha='right')
plt.show()
importance_pairs = zip(feature_names, combined_importances)
for feature, importance in importance_pairs:
print(f'{feature}: {importance:.2f}')