I have programmed an Aho-Corasick algorithm with a transition table that searches for a set of words in a text and displays the number of occurrences by using malloc(), but I am encountering this error:
Segmentation fault (core dumped)
Here is the algorithm that I have programmed
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define ALPHABET_SIZE 70
#define MAX_WORD_LENGTH 100000
typedef struct Element Element;
struct Element {
int nombre;
Element *suivant;
};
typedef struct File File;
struct File {
Element *premier;
Element *dernier;
};
File *creerfile() {
File *file = (File *)malloc(sizeof(File));
if (file == NULL) {
perror("Erreur d'allocation pour la file");
exit(EXIT_FAILURE);
}
file->premier = NULL;
file->dernier = NULL;
return file;
}
int estVide(File *file) {
return file->premier == NULL;
}
void enfiler(File *file, int element) {
Element *nouveau = malloc(sizeof(*nouveau));
if (nouveau == NULL) {
perror("Erreur d'allocation pour un élément de la file");
exit(EXIT_FAILURE);
}
nouveau->nombre = element;
nouveau->suivant = NULL;
if (file->premier != NULL) {
file->dernier->suivant = nouveau;
} else {
file->premier = nouveau;
}
file->dernier = nouveau;
}
int defiler(File *file) {
if (file == NULL || file->premier == NULL) {
perror("Erreur de défiler : file vide");
exit(EXIT_FAILURE);
}
int nombreDefile = file->premier->nombre;
Element *elementDefile = file->premier;
file->premier = elementDefile->suivant;
free(elementDefile);
if (file->premier == NULL) {
file->dernier = NULL;
}
return nombreDefile;
}
struct _trie {
int maxNode;
int nextNode;
int **transition;
char *finale;
int *supp;
};
typedef struct _trie Trie;
void initialisationTrie(Trie *trie, int maxNode) {
trie->maxNode = maxNode;
trie->nextNode = 1;
trie->transition = (int **)malloc(maxNode * sizeof(int *));
if (trie->transition == NULL) {
perror("Erreur d'allocation pour le tableau de transitions");
exit(EXIT_FAILURE);
}
for (int i = 0; i < maxNode; ++i) {
trie->transition[i] = (int *)malloc(ALPHABET_SIZE * sizeof(int));
if (trie->transition[i] == NULL) {
perror("Erreur d'allocation pour une ligne du tableau de transitions");
exit(EXIT_FAILURE);
}
for (int j = 0; j < ALPHABET_SIZE; ++j) {
trie->transition[i][j] = 0;
}
}
trie->finale = (char *)malloc(maxNode * sizeof(char));
if (trie->finale == NULL) {
perror("Erreur d'allocation pour le tableau finale");
exit(EXIT_FAILURE);
}
for (int i = 0; i < maxNode; ++i) {
trie->finale[i] = 0;
}
trie->supp = (int *)malloc(maxNode * sizeof(int));
if (trie->supp == NULL) {
perror("Erreur d'allocation pour le tableau supp");
exit(EXIT_FAILURE);
}
for (int i = 0; i < maxNode; ++i) {
trie->supp[i] = 0;
}
}
void ajoutmot(Trie *trie, char mot[MAX_WORD_LENGTH]) {
int courant = 0;
for (int i = 0; i < strlen(mot); ++i) {
char c = mot[i];
int index;
if (c >= 'a' && c <= 'z') {
index = c - 'a';
} else {
// Caractère spécial, traitez-le comme une lettre
index = c - 'A' + 26; // Ajoutez 26 pour l'ajustement
}
if (trie->transition[courant][index] == 0) {
trie->transition[courant][index] = trie->nextNode;
trie->nextNode++;
}
courant = trie->transition[courant][index];
}
trie->finale[courant] = 1;
}
void constructionsuppleants(Trie *trie) {
File *file = creerfile();
for (int i = 0; i < ALPHABET_SIZE; ++i) {
if (trie->transition[0][i] != 0) {
enfiler(file, trie->transition[0][i]);
trie->supp[trie->transition[0][i]] = 0;
}
}
while (!estVide(file)) {
int r = defiler(file);
for (int i = 0; i < ALPHABET_SIZE; ++i) {
int s = trie->transition[r][i];
if (s != 0) {
enfiler(file, s);
int etat = trie->supp[r];
while (trie->transition[etat][i] == 0 && etat != 0) {
etat = trie->supp[etat];
}
trie->supp[s] = trie->transition[etat][i] != 0 ? trie->transition[etat][i] : 0;
}
}
}
free(file);
}
int AhoCorrasick(Trie *trie, char *text) {
int cmpt = 0;
int etat = 0;
for (int i = 0; text[i] != '\0'; ++i) {
char c = text[i];
int index;
if (c >= 'a' && c <= 'z') {
index = c - 'a';
} else {
// Traitez le caractère spécial comme une lettre
index = c - 'a' + 26;
}
while (trie->transition[etat][index] == 0 && etat != 0) {
etat = trie->supp[etat];
}
etat = trie->transition[etat][index] != 0 ? trie->transition[etat][index] : 0;
int etatTemporaire = etat;
while (etatTemporaire != 0) {
if (trie->finale[etatTemporaire] == 1) {
cmpt++;
}
etatTemporaire = trie->supp[etatTemporaire];
}
}
return cmpt;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "aho_corrasick_matrice.c"
#include "aho_corrasick_matrice.h"
#define ALPHABET_SIZE 70
#define BUFFER_SIZE 4096
#define MAX_WORD_LENGTH 100000
int main(int argc, char *argv[]) {
if (argc != 3) {
fprintf(stderr, "Usage: %s mots.txt texte.txt\n", argv[0]);
exit(EXIT_FAILURE);
}
Trie trie;
long int maxNode = 500000;
initialisationTrie(&trie, maxNode);
FILE *motsFile = fopen(argv[1], "r");
if (motsFile == NULL) {
fprintf(stderr, "Erreur lors de l'ouverture du fichier %s\n", argv[1]);
exit(EXIT_FAILURE);
}
char mot[MAX_WORD_LENGTH];
while (fscanf(motsFile, "%s", mot) == 1) {
ajoutmot(&trie, mot);
}
fclose(motsFile);
constructionsuppleants(&trie);
FILE *texteFile = fopen(argv[2], "r");
if (texteFile == NULL) {
fprintf(stderr, "Erreur lors de l'ouverture du fichier %s\n", argv[2]);
exit(EXIT_FAILURE);
}
char buffer[BUFFER_SIZE];
int occurrences = 0;
while (fgets(buffer, sizeof(buffer), texteFile) != NULL) {
occurrences += AhoCorrasick(&trie, buffer);
}
printf("Nombre d'occurrences est : %d\n", occurrences);
fclose(texteFile);
free(trie.finale);
for (int i = 0; i < maxNode; ++i) {
free(trie.transition[i]);
}
free(trie.transition);
free(trie.supp);
return 0;
}
For searching sets in a text with an alphabet size of 20, it works well, but with an alphabet size of 70, it doesn't work, and I get this segmentation fault error.
There are at least these problems:
you convert the characters into index values that can be larger than ALPHABET_SIZE, invoking undefined behavior when you index into the transitions tables.
the initial value maxNode of 500000 might be too small if the dictionary is large. You do not check if trie->nextNode stays inside the array boundaries in ajoutmot (ie: trie->nextNode < trie->maxNode), causing undefined behavior when writing to trie->transition[courant][index] or trie->finale[courant] with courant >= trie->maxNode.
Here is a modified version of ajoutmot and AhoCorrasick using a normalized index value:
int charindex(char c) {
#if ALPHABET_SIZE >= 64
if (c >= 'a' && c <= 'z') {
return c - 'a';
} else
if (c >= 'A' && c <= 'Z') {
return c - 'A' + 26;
} else
if (c >= '0' && c <= '9') {
return c - '0' + 52;
} else
if (c == ' ') {
return 62;
} else {
// Caractère spécial, traitez-les comme une lettre supplémentaire
return 63;
}
#else
#error ALPHABET_SIZE too small
#endif
}
void ajoutmot(Trie *trie, char mot[MAX_WORD_LENGTH]) {
int courant = 0;
for (size_t i = 0; i < strlen(mot); ++i) {
char c = mot[i];
int index = charindex(c);
if (trie->transition[courant][index] == 0) {
if (trie->nextNode >= trie->maxNode) {
fprintf(stderr, "Erreur: tableau de transitions plein pour le mot %s\n", mot);
exit(EXIT_FAILURE);
}
trie->transition[courant][index] = trie->nextNode;
trie->nextNode++;
}
courant = trie->transition[courant][index];
}
trie->finale[courant] = 1;
}
int AhoCorrasick(Trie *trie, char *text) {
int cmpt = 0;
int etat = 0;
for (int i = 0; text[i] != '\0'; ++i) {
char c = text[i];
int index = charindex(c);
while (trie->transition[etat][index] == 0 && etat != 0) {
etat = trie->supp[etat];
}
etat = trie->transition[etat][index] != 0 ? trie->transition[etat][index] : 0;
int etatTemporaire = etat;
while (etatTemporaire != 0) {
if (trie->finale[etatTemporaire] == 1) {
cmpt++;
}
etatTemporaire = trie->supp[etatTemporaire];
}
}
return cmpt;
}
This might not fix all the problems, but the segmentation faults should be prevented. Note that a segmentation fault is an easy bug to fix as the debugger will point to the offending code immediately.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With