Category: Artificial Intelligence

  • Deep Learning

    Deep Learning (DL)

    Deep Learning (DL) is a subset of Machine Learning that utilizes artificial neural networks with multiple layers to identify complex patterns in data. Unlike traditional machine learning methods, which often depend on manually engineered features, deep learning models automatically learn hierarchical representations directly from raw data. This capability makes them particularly effective for processing unstructured data, such as images, audio, text, and video. Deep learning is widely applied in various fields, including image recognition, speech processing, natural language processing, autonomous systems, and cybersecurity, where large-scale and complex data need to be analyzed efficiently.

    Process

    • Input (raw data)
    • Hidden layers (learn low-level -> high-level features automatically)
    • Output (prediction / classification)

    Example (Addition)

    import numpy as np # For numerical operations and generating random data
    from tensorflow.keras.models import Sequential # For building a sequential neural network
    from tensorflow.keras.layers import Dense # Fully connected (dense) neural network layer
    from tensorflow.keras.callbacks import EarlyStopping # Stop training early if the model stops improving

    # Generate random input data
    x = np.random.randint(0, 500, size=(1000,2)) # 1000 samples, 2 features each (random integers 0-499)
    y = x[:, 0] + x[:, 1] # Target is sum of two features

    # Build a simple neural network
    model = Sequential() # Initialize sequential model
    model.add(Dense(32, input_shape=(2,), activation='relu')) # Hidden layer with 32 neurons, ReLU activation
    model.add(Dense(1)) # Output layer with 1 neuron (predict sum)

    # Compile the model
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae']) # Use MAE loss and Adam optimizer

    # Train the model (~1000 samples; batch size 32 is a hyperparameter)
    # For a fixed validation set, use train_test_split instead of validation_split
    model.fit(
        x, y, # Training data and targets
        validation_split=0.2, # Use 20% of data for validation
        batch_size=32, # Batch size for training
        epochs=100, # Maximum number of epochs
        verbose=1, # Show progress
        callbacks=[EarlyStopping(monitor='val_loss', patience=5)] # Stop early if validation loss doesn't improve for 5 epochs
    )

    # Predict on new data
    print(model.predict(np.array([[0.2, 10], [50, 1]]))) # Predict sum for two new samples

    Example (Multiplication)

    import numpy as np # For creating and handling arrays
    from tensorflow.keras.models import Sequential # For building a sequential neural network
    from tensorflow.keras.layers import Dense # Fully connected (dense) neural network layer
    from tensorflow.keras.callbacks import EarlyStopping # Stop training early if validation loss stops improving

    # Generate random input data
    x = np.random.randint(0, 10, size=(1000,2)) # 1000 samples, each with 2 features (integers 0-9)
    y = x[:, 0] * x[:, 1] # Target = multiplication of the two features

    # Build the neural network
    model = Sequential() # Initialize sequential model
    model.add(Dense(64, input_shape=(2,), activation='relu')) # First hidden layer with 64 neurons, ReLU activation
    model.add(Dense(64, activation='relu')) # Second hidden layer with 64 neurons, ReLU activation
    model.add(Dense(1)) # Output layer with 1 neuron (predict the product)

    # Compile the model
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae']) # MAE loss for regression, Adam optimizer

    # Train the model (~1000 samples; batch size 32 is a hyperparameter)
    # For a fixed validation set, use train_test_split instead of validation_split
    model.fit(
        x, y, # Training data and targets
        validation_split=0.2, # Use 20% of data for validation
        batch_size=32, # Batch size
        epochs=100, # Maximum number of epochs
        verbose=1, # Show progress bar
        callbacks=[EarlyStopping(monitor='val_loss', patience=5)] # Stop early if validation loss does not improve for 5 epochs
    )

    # Predict new data
    print(model.predict(np.array([[2, 3]]))) # Predict the product of 2*3

    Predicting Suspicious Emails (Phishing)

    import numpy as np # Numerical operations (not heavily used here but commonly included)
    from tensorflow.keras.models import Sequential # Sequential model (stack layers linearly)
    from tensorflow.keras.layers import Dense # Fully connected (dense) neural network layers
    from sklearn.feature_extraction.text import CountVectorizer # Converts text into numeric feature vectors (bag-of-words)
    emails = [
        "Click here to reset your password", # Likely phishing example
        "Your invoice is attached", # Likely safe example
        "Verify your bank account immediately", # Likely phishing example
        "Meeting tomorrow at 10am", # Likely safe example
    ]
    labels = np.array([1, 0, 1, 0]) # Target labels: 1 = phishing, 0 = safe (NumPy array for Keras)
    vectorizer = CountVectorizer() # Initialize text vectorizer (bag-of-words model)
    features = vectorizer.fit_transform(emails).toarray() # Learn vocabulary + convert emails into numeric feature matrix
    model = Sequential() # Create a sequential neural network model
    model.add(Dense(32, input_shape=(features.shape[1],), activation='relu')) # Input layer + first hidden layer (32 neurons)
    model.add(Dense(16, activation='relu')) # Second hidden layer (16 neurons)
    model.add(Dense(1, activation='sigmoid')) # Output layer (1 neuron for binary classification, sigmoid = probability)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # Configure model training settings
    model.fit(features, labels, epochs=50, verbose=0) # Train the model for 50 iterations (epochs), no training output shown
    new_emails = vectorizer.transform([
        "Your account will be locked, click here", # Suspicious/phishing-like message
        "Lunch tomorrow?" # Normal/safe message
    ]).toarray() # Convert new emails into the same feature format
    prediction = model.predict(new_emails) > 0.5 # Predict probabilities and convert to True/False using threshold 0.5
    print("Phishing predictions (True=Phishing, False=Safe):", prediction) # Display prediction results

    Predicting Suspicious Files (Malware)

    import numpy as np # Library for numerical operations and arrays
    from tensorflow.keras.models import Sequential # Sequential model to stack layers
    from tensorflow.keras.layers import Dense # Fully connected neural network layers
    x = np.random.randint(0, 100, size=(1000, 3)) # Generate 1000 samples, each with 3 random features (0–99)
    y = (x[:,0] + x[:,1] + x[:,2] > 150).astype(int) # Label: 1 (malware) if sum > 150, else 0 (safe)
    model = Sequential() # Initialize the neural network model
    model.add(Dense(32, input_shape=(3,), activation='relu')) # Input layer + first hidden layer (32 neurons, ReLU activation)
    model.add(Dense(16, activation='relu')) # Second hidden layer (16 neurons)
    model.add(Dense(1, activation='sigmoid')) # Output layer (1 neuron, sigmoid for binary classification)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # Configure model with optimizer, loss, and accuracy metric
    model.fit(x, y, epochs=50, batch_size=32, verbose=0)  # Train the model for 50 epochs with batch size of 32
    new_files = np.array([[60, 50, 50], [10, 5, 15]]) # New data samples to classify (each has 3 features)
    prediction = model.predict(new_files) > 0.5 # Predict probabilities and convert to True/False using threshold 0.5
    print("Malware predictions (True=Malware, False=Safe):", prediction) # Print classification results

  • Machine Learning

    Machine Learning (ML)

    Machine Learning (ML) is a branch of artificial intelligence that allows systems to learn from data and enhance their performance on tasks without needing explicit programming. ML algorithms examine data to detect patterns and relationships, which can then be utilized for making predictions, classifications, or decisions. These techniques are commonly applied in areas such as fraud detection, recommendation systems, and predictive analytics. Unlike traditional programming, ML focuses on data-driven learning and can handle both structured and unstructured data.

    Process

    • Training
      • Input data
      • Feature extraction (manual in traditional ML, automatic in deep learning)
      • Model learning
    • Prediction (Inference)
      • New input data
      • Apply trained model
      • Output prediction or classification

    Data Splitting

    • Training set: Used to train the model
    • Validation set: Used to tune and evaluate during training
    • Test set: Used to evaluate final performance on unseen data
    • A common split is 70% / 20% / 10%, but this may vary (a splitting sketch follows this list).
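
    As a minimal sketch of the 70% / 20% / 10% split using scikit-learn's train_test_split (the random data and seed below are illustrative assumptions, not part of the examples in this section), you can split twice:

    import numpy as np # For generating illustrative data
    from sklearn.model_selection import train_test_split # Utility for random data splitting

    x = np.random.randint(0, 500, size=(1000, 2)) # Illustrative random data (1000 samples, 2 features)
    y = x[:, 0] + x[:, 1] # Illustrative target

    # Split off the 10% test set first, then carve the validation set out of the remaining 90%
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=2/9, random_state=42) # 2/9 of 90% = 20%

    print(len(x_train), len(x_val), len(x_test)) # 700 200 100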

    Example

    import numpy as np # For handling arrays
    from sklearn.feature_extraction.text import CountVectorizer # Convert text to numeric feature vectors
    from sklearn.ensemble import RandomForestClassifier # Machine learning model for classification

    # Input texts (simulated messages) and labels
    texts = np.array([
        'Click at this link', # Suspicious / phishing-like message
        'Click at this link to download', # Suspicious
        'Click here to transfer money', # Suspicious
        'My name is Jone', # Normal / safe message
        'How are you' # Normal / safe message
    ])
    labels = np.array([1, 1, 1, 0, 0]) # 1 = positive/suspicious, 0 = negative/normal
    tags = np.array(["negative", "positive"]) # Labels for display

    # Extract features from text using Bag-of-Words
    count_vectorizer = CountVectorizer(min_df=1) # Convert text to word frequency vectors
    features = count_vectorizer.fit_transform(texts).toarray() # Learn vocabulary and convert texts to array

    # Train Random Forest classifier
    random_forest_classifier = RandomForestClassifier() # Initialize model
    random_forest_classifier.fit(features, labels) # Train model on features and labels

    # Predict new text
    features = count_vectorizer.transform(['How are you']) # Convert new text to feature vector
    prediction = random_forest_classifier.predict(features) # Predict label (0 or 1)
    print(prediction, tags[prediction]) # Print numeric prediction and human-readable tag

  • Natural Language Processing

    Natural Language Processing

    Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on enabling computers to understand, interpret, and interact with human language in a meaningful way. It combines linguistics, computer science, and machine learning to process text and speech, allowing machines to analyze syntax, semantics, and context in written or spoken language. NLP is used for tasks such as sentiment analysis, language translation, chatbots, information extraction, and text summarization. While NLP focuses on understanding and interpreting language, rather than predicting future events, it forms the foundation for applications that require machines to comprehend and respond to human communication in a natural, human-like manner.


    Text Pre-Processing

    There is a popular Python module called nltk that is used for NLP tasks. This module can be used to enhance threat detection and response.

    Install

    pip3 install nltk # Install the Natural Language Toolkit (NLTK) library for Python 3

    Run this in Python

    import nltk # Imports the Natural Language Toolkit (NLP library) into your Python script
    nltk.download('all') # Downloads all available NLTK datasets, models, and corpora

    Breaking Sentences Into Words

    You can break unstructured data and natural language text into chunks of information using a tokenizer; these tokens can then be converted into numerical data structures for machine learning. E.g., you can break a sentence into words using the word_tokenize() method.

    Example

    from nltk.tokenize import word_tokenize # Imports the word_tokenize function from NLTK's tokenize module
    print(word_tokenize("Please follow this link.")) # Tokenizes (splits) the sentence into individual words and punctuation, then prints the resulting list

    Output

    ['Please', 'follow', 'this', 'link', '.']
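
    The same module also provides the sent_tokenize() method for splitting raw text into sentences; the two-sentence string below is just an illustrative example.

    from nltk.tokenize import sent_tokenize # Imports the sentence tokenizer
    print(sent_tokenize("Please follow this link. It expires today.")) # ['Please follow this link.', 'It expires today.']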

    Finding Common Words

    You can find the most common words in a sentence using the FreqDist() class

    Example

    from nltk.probability import FreqDist # Imports FreqDist class to calculate word frequency distribution
    from nltk.tokenize import word_tokenize # Imports the word_tokenize function to split text into tokens
    tokens = word_tokenize("Please follow this link.") # Tokenizes the sentence into individual words and punctuation marks
    FreqDist(tokens).tabulate() # Creates a frequency distribution of the tokens and displays the counts in a formatted table

    Output

     Please follow    this    link       . 
          1       1       1       1       1 
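
    Every token above appears once, so all counts are 1. With a repeated word (the text below is an illustrative example), FreqDist can also return the most frequent tokens directly via its most_common() method:

    from nltk.probability import FreqDist # Frequency distribution of tokens
    from nltk.tokenize import word_tokenize # Splits text into tokens
    tokens = word_tokenize("click here to click the link") # 'click' appears twice
    print(FreqDist(tokens).most_common(2)) # [('click', 2), ('here', 1)]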

    Finding Sentence Parts

    If you want to find noun, pronoun, verb, adjective, adverb, preposition, conjunction, interjection, etc. tags in a sentence, you can use the pos_tag() method. You can review all the tags using nltk.help.upenn_tagset(). Note that the example below tags each token in isolation, so the tagger has no sentence context and some tags (e.g., 'follow' as NN) may be inaccurate.

    Example

    from nltk import pos_tag # Imports the part-of-speech (POS) tagging function
    from nltk.tokenize import word_tokenize # Imports the tokenizer to split text into words
    tokens = word_tokenize("Please follow this link.") # Splits the sentence into individual tokens (words and punctuation)
    for token in tokens: # Loops through each token
        print(pos_tag([token])) # Tags the token with its part of speech and prints it

    Output

    [('Please', 'VB')]
    [('follow', 'NN')]
    [('this', 'DT')]
    [('link', 'NN')]
    [('.', '.')]

    Normalizing Words

    If you want to normalize a word, you can use the PorterStemmer() class or the lemmatize() method. Stemming removes the last few characters from a word (it removes the suffix), whereas lemmatization replaces a word with its root or head word (it returns the lemma). Search engines usually use them to analyze the meaning of a word and then return search results that include all relevant forms of that word. E.g., if you search for cars, you also get results for car. Bots use that to understand the overall meaning of a sentence.

    Example

    from nltk.stem import PorterStemmer # Imports the Porter Stemmer algorithm for word stemming
    for item in ["test", "tests", "testing", "tested"]: # Loops through each word in the list
        print(item, ": ", PorterStemmer().stem(item)) # Applies stemming to each word and prints the original word along with its stemmed (root) form

    Output

    test :  test
    tests :  test
    testing :  test
    tested :  test

    Example

    from nltk.stem import WordNetLemmatizer # Imports the WordNet lemmatizer (uses vocabulary + morphology rules)
    for item in ["test", "tests", "testing", "tested"]: # Loops through each word in the list
        print(item, ": ", WordNetLemmatizer().lemmatize(item)) # Lemmatizes (reduces to dictionary base form) each word and prints the original word with its lemma

    Output

    test :  test
    tests :  test
    testing :  testing
    tested :  tested

    Example

    from nltk.stem import WordNetLemmatizer # Imports the WordNet lemmatizer
    from nltk.corpus import wordnet # Imports WordNet corpus (provides POS constants)
    from nltk import word_tokenize, pos_tag # Imports tokenizer and POS tagger
    mapped = {
        "V": wordnet.VERB, # Maps POS tags starting with 'V' to VERB
        "J": wordnet.ADJ, # Maps POS tags starting with 'J' to ADJECTIVE
        "R": wordnet.ADV  # Maps POS tags starting with 'R' to ADVERB
    }
    tokens = word_tokenize("caring") # Tokenizes the word
    for token, tag in pos_tag(tokens): # Tags the token with its Penn Treebank POS tag (e.g., VBG, NN, JJ)
        tag = mapped.get(tag[0], wordnet.NOUN) # If the first letter of the POS tag exists in the mapped dictionary, use the corresponding WordNet POS; otherwise, default to NOUN
        print(token, WordNetLemmatizer().lemmatize(token, tag)) # Lemmatizes the token using the correct POS

    Part-Of-Speech

    POS stands for Part-Of-Speech, which is a grammatical category assigned to each word in a sentence. POS tagging tells you whether a word is a noun, verb, adjective, adverb, etc., based on its role in the sentence. The Penn Treebank tags are listed below, followed by a short tagging sketch.

    CC Coordinating conjunction
    CD Cardinal number
    DT Determiner
    EX Existential there 
    FW Foreign word
    IN Preposition or subordinating conjunction
    JJ Adjective
    JJR Adjective, comparative
    JJS Adjective, superlative
    LS List item marker
    MD Modal
    NN Noun, singular or mass
    NNS Noun, plural
    NNP Proper noun, singular
    NNPS Proper noun, plural
    PDT Predeterminer
    POS Possessive ending
    PRP Personal pronoun
    PRP$ Possessive pronoun
    RB Adverb
    RBR Adverb, comparative
    RBS Adverb, superlative
    RP Particle
    SYM Symbol
    TO to
    UH Interjection
    VB Verb, base form
    VBD Verb, past tense
    VBG Verb, gerund or present participle
    VBN Verb, past participle
    VBP Verb, non-3rd person singular present
    VBZ Verb, 3rd person singular present
    WDT Wh-determiner
    WP Wh-pronoun
    WP$ Possessive wh-pronoun
    WRB Wh-adverb
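
    As a short sketch (the sentence is the same example used above), tagging the whole tokenized sentence at once gives the tagger context, so the tags are generally more accurate than tagging words one at a time:

    from nltk import pos_tag # Part-of-speech tagger
    from nltk.tokenize import word_tokenize # Splits text into tokens
    print(pos_tag(word_tokenize("Please follow this link."))) # Prints a list of (word, tag) pairs using the tags in the table above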

    Removing Stop Words

    If you want to remove stopwords from a sentence, you can compare each word of the sentence against NLTK's stopword list

    Example

    from nltk.tokenize import word_tokenize # Import the word tokenizer
    from nltk.corpus import stopwords # Import the stopwords list
    tokens = word_tokenize("Please followw this link.") # Tokenize the sentence into words
    stop_words = set(stopwords.words('english')) # Get the set of English stopwords
    filtered = [w for w in tokens if w.lower() not in stop_words] # Filter out tokens that are stopwords
    print(filtered) # Print the filtered words

    Output

    ['Please', 'followw', 'link', '.']

    Example #1

    You can clean text using regex and nltk

    import re # Import regular expressions for pattern-based text cleaning
    from nltk.corpus import stopwords # Import the list of common English stopwords
    def clean_text(text):
        text = text.lower() # Convert all letters to lowercase so that 'This' and 'this' are treated the same
        text = re.sub(r'\d+', ' ', text) # Remove all digits/numbers by replacing them with a space
        text = re.sub(r'[^\w\s]', ' ', text) # Remove punctuation by replacing anything that is NOT a word character or whitespace with a space
        stop_words = set(stopwords.words('english')) # Build the stopword set once instead of once per word
        text = " ".join(w for w in text.split() if w not in stop_words) # Remove stopwords (common words like 'the', 'is', 'this')
        return text # Return the cleaned text
    print(clean_text("Please follow this link.")) # Expected output: "please follow link"

    Output

    please follow link

    Example #2

    If you want to check a phishing email for broken words, you can do that using the nltk module

    import nltk # Import NLTK library
    words = set(nltk.corpus.words.words()) # Load the set of valid English words from the NLTK corpus
    sentence = "Please followw this link." # Example sentence to check
    errors = [] # List to store words not found in the dictionary (possible typos)
    for w in nltk.wordpunct_tokenize(sentence): # Tokenize the sentence into words and punctuation
        if w.lower() in words or not w.isalpha(): # Check if the word is in the dictionary or is non-alphabetic (punctuation, numbers)
            pass # Word is correct or ignored
        else:
            errors.append(w) # Word is likely a typo
    print("Error(s): ", len(errors)) # Print the number of errors found

    Output

    Error(s): 1