Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Aho-Corasick algorithm with C language

I have programmed an Aho-Corasick algorithm with a transition table that searches for a set of words in a text and displays the number of occurrences by using malloc(), but I am encountering this error:

Segmentation fault (core dumped)

Here is the algorithm that I have programmed

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ALPHABET_SIZE 70
#define MAX_WORD_LENGTH 100000

typedef struct Element Element;
struct Element {
    int nombre;
    Element *suivant;
};

typedef struct File File;

struct File {
    Element *premier;
    Element *dernier;
};

File *creerfile() {
    File *file = (File *)malloc(sizeof(File));
    if (file == NULL) {
        perror("Erreur d'allocation pour la file");
        exit(EXIT_FAILURE);
    }

    file->premier = NULL;
    file->dernier = NULL;

    return file;
}

int estVide(File *file) {
    return file->premier == NULL;
}

void enfiler(File *file, int element) {
    Element *nouveau = malloc(sizeof(*nouveau));
    if (nouveau == NULL) {
        perror("Erreur d'allocation pour un élément de la file");
        exit(EXIT_FAILURE);
    }

    nouveau->nombre = element;
    nouveau->suivant = NULL;

    if (file->premier != NULL) {
        file->dernier->suivant = nouveau;
    } else {
        file->premier = nouveau;
    }

    file->dernier = nouveau;
}

int defiler(File *file) {
    if (file == NULL || file->premier == NULL) {
        perror("Erreur de défiler : file vide");
        exit(EXIT_FAILURE);
    }

    int nombreDefile = file->premier->nombre;
    Element *elementDefile = file->premier;

    file->premier = elementDefile->suivant;
    free(elementDefile);

    if (file->premier == NULL) {
        file->dernier = NULL;
    }

    return nombreDefile;
}

struct _trie {
    int maxNode;
    int nextNode;
    int **transition;
    char *finale;
    int *supp;
};

typedef struct _trie Trie;

void initialisationTrie(Trie *trie, int maxNode) {
    trie->maxNode = maxNode;
    trie->nextNode = 1;
    trie->transition = (int **)malloc(maxNode * sizeof(int *));
    if (trie->transition == NULL) {
        perror("Erreur d'allocation pour le tableau de transitions");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < maxNode; ++i) {
        trie->transition[i] = (int *)malloc(ALPHABET_SIZE * sizeof(int));
        if (trie->transition[i] == NULL) {
            perror("Erreur d'allocation pour une ligne du tableau de transitions");
            exit(EXIT_FAILURE);
        }
        for (int j = 0; j < ALPHABET_SIZE; ++j) {
            trie->transition[i][j] = 0;
        }
    }

    trie->finale = (char *)malloc(maxNode * sizeof(char));
    if (trie->finale == NULL) {
        perror("Erreur d'allocation pour le tableau finale");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < maxNode; ++i) {
        trie->finale[i] = 0;
    }

    trie->supp = (int *)malloc(maxNode * sizeof(int));
    if (trie->supp == NULL) {
        perror("Erreur d'allocation pour le tableau supp");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < maxNode; ++i) {
        trie->supp[i] = 0;
    }
}

void ajoutmot(Trie *trie, char mot[MAX_WORD_LENGTH]) {
    int courant = 0;
    for (int i = 0; i < strlen(mot); ++i) {
        char c = mot[i];
        int index;
        if (c >= 'a' && c <= 'z') {
            index = c - 'a';
        } else {
            // Caractère spécial, traitez-le comme une lettre
            index = c - 'A' + 26;  // Ajoutez 26 pour l'ajustement
        }

        if (trie->transition[courant][index] == 0) {
            trie->transition[courant][index] = trie->nextNode;
            trie->nextNode++;
        }

        courant = trie->transition[courant][index];
    }
    trie->finale[courant] = 1;
}

void constructionsuppleants(Trie *trie) {
    File *file = creerfile();
    for (int i = 0; i < ALPHABET_SIZE; ++i) {
        if (trie->transition[0][i] != 0) {
            enfiler(file, trie->transition[0][i]);
            trie->supp[trie->transition[0][i]] = 0;
        }
    }

    while (!estVide(file)) {
        int r = defiler(file);
        for (int i = 0; i < ALPHABET_SIZE; ++i) {
            int s = trie->transition[r][i];
            if (s != 0) {
                enfiler(file, s);
                int etat = trie->supp[r];
                while (trie->transition[etat][i] == 0 && etat != 0) {
                    etat = trie->supp[etat];
                }
                trie->supp[s] = trie->transition[etat][i] != 0 ? trie->transition[etat][i] : 0;
            }
        }
    }

    free(file);
}

int AhoCorrasick(Trie *trie, char *text) {
    int cmpt = 0;
    int etat = 0;
    for (int i = 0; text[i] != '\0'; ++i) {
        char c = text[i];
        int index;
        if (c >= 'a' && c <= 'z') {
            index = c - 'a';
        } else {
            // Traitez le caractère spécial comme une lettre
            index = c - 'a' + 26;
        }

        while (trie->transition[etat][index] == 0 && etat != 0) {
            etat = trie->supp[etat];
        }

        etat = trie->transition[etat][index] != 0 ? trie->transition[etat][index] : 0;

        int etatTemporaire = etat;
        while (etatTemporaire != 0) {
            if (trie->finale[etatTemporaire] == 1) {
                cmpt++;
            }
            etatTemporaire = trie->supp[etatTemporaire];
        }
    }

    return cmpt;
}

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "aho_corrasick_matrice.c"
#include "aho_corrasick_matrice.h"

#define ALPHABET_SIZE 70
#define BUFFER_SIZE 4096
#define MAX_WORD_LENGTH 100000

int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s mots.txt texte.txt\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    Trie trie;
    long int maxNode = 500000;
    initialisationTrie(&trie, maxNode);

    FILE *motsFile = fopen(argv[1], "r");
    if (motsFile == NULL) {
        fprintf(stderr, "Erreur lors de l'ouverture du fichier %s\n", argv[1]);
        exit(EXIT_FAILURE);
    }

    char mot[MAX_WORD_LENGTH];
    while (fscanf(motsFile, "%s", mot) == 1) {
        ajoutmot(&trie, mot);
    }

    fclose(motsFile);

    constructionsuppleants(&trie);

    FILE *texteFile = fopen(argv[2], "r");
    if (texteFile == NULL) {
        fprintf(stderr, "Erreur lors de l'ouverture du fichier %s\n", argv[2]);
        exit(EXIT_FAILURE);
    }

    char buffer[BUFFER_SIZE];
    int occurrences = 0;

    while (fgets(buffer, sizeof(buffer), texteFile) != NULL) {
        occurrences += AhoCorrasick(&trie, buffer);
    }

    printf("Nombre d'occurrences est : %d\n", occurrences);

    fclose(texteFile);

    free(trie.finale);
    for (int i = 0; i < maxNode; ++i) {
        free(trie.transition[i]);
    }
    free(trie.transition);
    free(trie.supp);

    return 0;
}

For searching sets in a text with an alphabet size of 20, it works well, but with an alphabet size of 70, it doesn't work, and I get this segmentation fault error.

like image 809
Zazou Avatar asked Dec 21 '25 06:12

Zazou


1 Answers

There are at least these problems:

  • you convert the characters into index values that can be larger than ALPHABET_SIZE, invoking undefined behavior when you index into the transitions tables.

  • the initial value maxNode of 500000 might be too small if the dictionary is large. You do not check if trie->nextNode stays inside the array boundaries in ajoutmot (ie: trie->nextNode < trie->maxNode), causing undefined behavior when writing to trie->transition[courant][index] or trie->finale[courant] with courant >= trie->maxNode.

Here is a modified version of ajoutmot and AhoCorrasick using a normalized index value:

int charindex(char c) {
#if ALPHABET_SIZE >= 64
    if (c >= 'a' && c <= 'z') {
        return c - 'a';
    } else
    if (c >= 'A' && c <= 'Z') {
        return c - 'A' + 26;
    } else
    if (c >= '0' && c <= '9') {
        return c - '0' + 52;
    } else
    if (c == ' ') {
        return 62;
    } else {
        // Caractère spécial, traitez-les comme une lettre supplémentaire
        return 63;
    }
#else
#error ALPHABET_SIZE too small
#endif
}

void ajoutmot(Trie *trie, char mot[MAX_WORD_LENGTH]) {
    int courant = 0;
    for (size_t i = 0; i < strlen(mot); ++i) {
        char c = mot[i];
        int index = charindex(c);

        if (trie->transition[courant][index] == 0) {
            if (trie->nextNode >= trie->maxNode) {
                fprintf(stderr, "Erreur: tableau de transitions plein pour le mot %s\n", mot);
                exit(EXIT_FAILURE);
            }
            trie->transition[courant][index] = trie->nextNode;
            trie->nextNode++;
        }
        courant = trie->transition[courant][index];
    }
    trie->finale[courant] = 1;
}

int AhoCorrasick(Trie *trie, char *text) {
    int cmpt = 0;
    int etat = 0;
    for (int i = 0; text[i] != '\0'; ++i) {
        char c = text[i];
        int index = charindex(c);

        while (trie->transition[etat][index] == 0 && etat != 0) {
            etat = trie->supp[etat];
        }

        etat = trie->transition[etat][index] != 0 ? trie->transition[etat][index] : 0;

        int etatTemporaire = etat;
        while (etatTemporaire != 0) {
            if (trie->finale[etatTemporaire] == 1) {
                cmpt++;
            }
            etatTemporaire = trie->supp[etatTemporaire];
        }
    }

    return cmpt;
}

This might not fix all the problems, but the segmentation faults should be prevented. Note that a segmentation fault is an easy bug to fix as the debugger will point to the offending code immediately.

like image 59
chqrlie Avatar answered Dec 23 '25 20:12

chqrlie



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!