nilekhet
/

Spectrum

Model card Files Files and versions Community

nilekhet committed on Apr 29, 2023

Commit

b743670

1 Parent(s): 081f8b3

Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
labelData.csv +10 -0
malware_classifier_lime.h5 +3 -0
model_training.py +131 -0
predict.py +61 -0
spectrum.tar +3 -0
validate.py +50 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+spectrum.tar filter=lfs diff=lfs merge=lfs -text

labelData.csv ADDED Viewed

	@@ -0,0 +1,10 @@

+Login successful for user admin,0
+User admin added a new user john_doe,0
+Failed login attempt from IP 192.168.1.55,1
+User john_doe deleted file important_document.txt,0
+New user jane_doe registered,0
+Unauthorized access from IP 10.0.0.27,1
+User admin updated file financial_report.xlsx,0
+Failed login attempt from IP 192.168.1.56,1
+User jane_doe uploaded file meeting_notes.docx,0
+Unauthorized access from IP 10.0.0.28,1

malware_classifier_lime.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b23d9ceac1ec5ac69287c9a597d51586a017dcc525aede8a6b0953ba0597ca56
+size 53156780

model_training.py ADDED Viewed

	@@ -0,0 +1,131 @@

+#the goal of this script is to train the model and then save it
+import os
+import tensorflow as tf
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Conv2D, Activation, MaxPooling2D, Flatten, Dense, Dropout
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.image import load_img, img_to_array
+import shutil
+from PIL import Image
+from tensorflow.keras.preprocessing.image import load_img, img_to_array
+import matplotlib.pyplot as plt
+import cv2
+#import seaborn as sns
+import numpy as np
+import pickle
+def clean_directory(directory, cache_file="cache.pkl"):
+    if os.path.exists(cache_file):
+        with open(cache_file, "rb") as f:
+            num_classes = pickle.load(f)
+            print("Loaded cached results.")
+            return num_classes
+    num_classes = 0
+    for subdir, dirs, files in os.walk(directory):
+        if not dirs:
+            num_classes += 1
+            valid_files = []
+            for file in files:
+                file_path = os.path.join(subdir, file)
+                try:
+                    img = Image.open(file_path)
+                    img.verify()  # Verify if the image is not corrupted
+                    valid_files.append(file)
+                except (IOError, SyntaxError) as e:
+                    print(f"Removing corrupted file: {file_path}")
+                    os.remove(file_path)
+            # Remove empty directories
+            if not valid_files:
+                print(f"Removing empty directory: {subdir}")
+                shutil.rmtree(subdir)
+                num_classes -= 1
+    # Save the results in cache
+    with open(cache_file, "wb") as f:
+        pickle.dump(num_classes, f)
+        print("Saved results to cache.")
+    return num_classes
+data_dir = 'Malign/extract'
+num_classes = clean_directory(data_dir)
+# Parameters
+batch_size = 32
+epochs = 50
+image_size = (200, 200)  # Set the desired image size for input to the model
+# Data preprocessing
+train_datagen = ImageDataGenerator(
+    rescale=1./255,
+    validation_split=0.2  # Split 20% of data for validation
+)
+train_generator = train_datagen.flow_from_directory(
+    data_dir,
+    target_size=image_size,
+    batch_size=batch_size,
+    class_mode='categorical',
+    subset='training'
+)
+validation_generator = train_datagen.flow_from_directory(
+    data_dir,
+    target_size=image_size,
+    batch_size=batch_size,
+    class_mode='categorical',
+    subset='validation'
+)
+# Model creation
+model = Sequential()
+# First convolution layer
+model.add(Conv2D(64, (3, 3), input_shape=(*image_size, 3)))
+model.add(Activation('relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+# Second convolution layer
+model.add(Conv2D(64, (3, 3)))
+model.add(Activation('relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+# Third convolution layer
+model.add(Conv2D(64, (3, 3)))
+model.add(Activation('relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+# Fully connected layers
+model.add(Flatten())
+model.add(Dense(128))
+model.add(Dropout(0.5))
+model.add(Activation('relu'))
+# Output layer
+model.add(Dense(119))
+model.add(Activation('softmax'))
+model.summary()
+model.compile(
+    optimizer=Adam(learning_rate=0.001),
+    loss='categorical_crossentropy',
+    metrics=['accuracy']
+)
+# Model training
+history = model.fit(
+    train_generator,
+    epochs=epochs,
+    validation_data=validation_generator
+)
+# Save the trained model
+model.save("malware_classifier_lime.h5")

predict.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import sys
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.image import load_img, img_to_array
+import matplotlib.pyplot as plt
+from lime import lime_image
+from skimage.segmentation import mark_boundaries
+def explain_instance(image_path, model, num_features=5, num_samples=1000):
+    img = load_img(image_path, target_size=image_size)
+    img_array = img_to_array(img) / 255
+    explanation = explainer.explain_instance(img_array, model.predict, top_labels=num_classes, hide_color=0,
+                                             num_samples=num_samples, num_features=num_features)
+    return explanation
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: predict.py image_path")
+        sys.exit(1)
+    image_path = sys.argv[1]
+    image_size = (200, 200)
+    model_path = "malware_classifier_lime.h5"
+    model = load_model(model_path)
+    num_classes = 119
+    explainer = lime_image.LimeImageExplainer()
+    explanation = explain_instance(image_path, model)
+    temp, mask = explanation.get_image_and_mask(explanation.top_labels[0], positive_only=True, num_features=5, hide_rest=False)
+    img = load_img(image_path, target_size=image_size)
+    img_array = img_to_array(img) / 255
+    # Display the original image
+    plt.figure(figsize=(10, 5))
+    plt.subplot(1, 2, 1)
+    plt.imshow(img_array)
+    plt.title("Original Image")
+    plt.axis("off")
+    # Display the LIME explanation
+    plt.subplot(1, 2, 2)
+    plt.imshow(mark_boundaries(temp, mask))
+    plt.title("LIME Explanation")
+    plt.axis("off")
+    plt.show()
+    # Make a prediction
+    img = load_img(image_path, target_size=image_size)
+    img_array = img_to_array(img) / 255
+    img_array = np.expand_dims(img_array, axis=0)
+    prediction = model.predict(img_array)
+    predicted_class = np.argmax(prediction)
+    # Get the class name
+    class_name = list(train_generator.class_indices.keys())[list(train_generator.class_indices.values()).index(predicted_class)]
+    print(f"Predicted class: {predicted_class}, Class name: {class_name}")

spectrum.tar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:485d96d9d396c57cde7433181d3645ebe5f87155972921a50ee101f9882a515d
+size 542136320

validate.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import os
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+from tensorflow.keras.models import load_model
+from sklearn.metrics import classification_report, confusion_matrix
+import pickle
+# Load the saved model
+model = load_model("malware_classifier_lime.h5")
+data_dir = 'Malign/extract'
+# Load the number of classes from the cache file
+with open("cache.pkl", "rb") as f:
+    num_classes = pickle.load(f)
+# Parameters
+batch_size = 32
+image_size = (200, 200)
+# Data preprocessing
+test_datagen = ImageDataGenerator(rescale=1./255)
+test_generator = test_datagen.flow_from_directory(
+    data_dir,
+    target_size=image_size,
+    batch_size=batch_size,
+    class_mode='categorical',
+    shuffle=False
+)
+# Evaluate the model
+print("Evaluating the model...")
+score = model.evaluate(test_generator)
+print("Loss: ", score[0])
+print("Accuracy: ", score[1])
+# Predict the class labels
+print("Predicting the class labels...")
+y_pred = model.predict(test_generator)
+y_pred_classes = np.argmax(y_pred, axis=1)
+# Classification report
+print("Classification report:")
+print(classification_report(test_generator.classes, y_pred_classes, target_names=test_generator.class_indices.keys()))
+# Confusion matrix
+print("Confusion matrix:")
+print(confusion_matrix(test_generator.classes, y_pred_classes))