NO

Author Topic: regex  (Read 2253 times)

czerny

  • Guest
regex
« on: January 20, 2015, 11:15:20 am »
The following example should output

Code: [Select]
Trying to find '([[:digit:]]+)[^[:digit:]]+([[:digit:]]+)' in 'This 1 is nice 2 so 33 for 4254'
$& is '1 is nice 2' (bytes 5:16)
$1 is '1' (bytes 5:6)
$2 is '2' (bytes 15:16)
$& is '33 for 4254' (bytes 20:31)
$1 is '33' (bytes 20:22)
$2 is '4254' (bytes 27:31)
No more matches.

The first match, finding '1' and '2', works fine. But after that, _regexec() crashes.
I can't find any error!  :(

Code: [Select]
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

// The following is the size of a buffer to contain any error messages encountered when the regular expression is compiled.

#define MAX_ERROR_MSG 0x1000

// Compile the pattern "regex_text" into "r" (extended syntax, newline-sensitive).
// On failure the library's error text is printed and 1 is returned; on success 0.

static int compile_regex (_regex_t * r, const char * regex_text)
{
    char error_message[MAX_ERROR_MSG];
    const int rc = _regcomp (r, regex_text, _REG_EXTENDED|_REG_NEWLINE);

    // Nothing to report when compilation succeeded.
    if (rc == 0) {
        return 0;
    }

    // Translate the status code into readable text before printing it.
    _regerror (rc, r, error_message, MAX_ERROR_MSG);
    printf ("Regex error compiling '%s': %s\n", regex_text, error_message);
    return 1;
}

// Match the string in "to_match" against the compiled regular expression in "r",
// printing every successive match and its sub-expressions together with their
// byte offsets relative to the start of "to_match".
//
// Returns the non-zero status of the _regexec() call that found no further match,
// or 0 when scanning stops at the end of the string after an empty match.

static int match_regex (_regex_t * r, const char * to_match)
{
    // "P" is a pointer into the string which points to the end of the previous match.
    const char * p = to_match;
    // "N_matches" is the maximum number of matches allowed.
    const int n_matches = 10;
    // "M" contains the matches found. Its size is a const int, not a constant
    // expression, so in C this is a variable-length array.
    _regmatch_t m[n_matches];

    while (1) {
        int i = 0;
        int nomatch = _regexec (r, p, n_matches, m, 0);
        if (nomatch) {
            printf ("No more matches.\n");
            return nomatch;
        }
        for (i = 0; i < n_matches; i++) {
            int start;
            int finish;
            // An offset of -1 marks the first unused sub-expression slot.
            if (m[i].rm_so == -1) {
                break;
            }
            // Offsets are relative to "p"; convert them to offsets into "to_match".
            start = m[i].rm_so + (p - to_match);
            finish = m[i].rm_eo + (p - to_match);
            if (i == 0) {
                printf ("$& is ");
            }
            else {
                printf ("$%d is ", i);
            }
            printf ("'%.*s' (bytes %d:%d)\n", (finish - start),
                    to_match + start, start, finish);
        }
        if (m[0].rm_eo > m[0].rm_so) {
            // Normal case: continue right after the text that was matched.
            p += m[0].rm_eo;
        }
        else if (p[m[0].rm_eo] == '\0') {
            // Empty match at the end of the string: nothing is left to scan.
            printf ("No more matches.\n");
            return 0;
        }
        else {
            // Empty match: step over one byte, otherwise "p" would never
            // advance and the same empty match would be found forever.
            p += m[0].rm_eo + 1;
        }
    }
    return 0;
}

// Entry point. With exactly two arguments, argv[1] is the pattern and argv[2]
// the text to search; otherwise a built-in demonstration pattern and text are used.
// Returns EXIT_FAILURE when the pattern cannot be compiled, 0 otherwise.
int main(int argc, char ** argv)
{
    _regex_t r;
    const char * regex_text = "([[:digit:]]+)[^[:digit:]]+([[:digit:]]+)";
    const char * find_text = "This 1 is nice 2 so 33 for 4254";

    if (argc == 3) {
        regex_text = argv[1];
        find_text = argv[2];
    }
    printf ("Trying to find '%s' in '%s'\n", regex_text, find_text);

    // A failed compile leaves "r" unusable, so it must neither be matched
    // against nor freed; compile_regex has already printed the reason.
    if (compile_regex(& r, regex_text) != 0) {
        return EXIT_FAILURE;
    }
    match_regex(& r, find_text);
    _regfree (& r);
    return 0;
}

Offline frankie

  • Global Moderator
  • Member
  • *****
  • Posts: 1698
Re: regex
« Reply #1 on: January 20, 2015, 01:37:05 pm »
Code: [Select]
        //p += m[0].rm_eo;
        p += m[i].rm_eo;

czerny

  • Guest
Re: regex
« Reply #2 on: January 20, 2015, 01:50:52 pm »
This is wrong! 'p' should point to the rest of the string (" so 33 for 4254") in order to find '33' and '4254'.
Code: [Select]
        //p += m[0].rm_eo;
        p += m[i].rm_eo;
So with this change there is no crash, but no second match either! :(
« Last Edit: January 20, 2015, 01:57:14 pm by czerny »

Offline frankie

  • Global Moderator
  • Member
  • *****
  • Posts: 1698
Re: regex
« Reply #3 on: January 20, 2015, 02:45:41 pm »
Code: [Select]
// Match "to_match" against the compiled regular expression in "r", printing
// every successive match and its sub-expressions with byte offsets relative
// to the start of "to_match".
//
// Returns the non-zero status of the _regexec() call that found no further match,
// or 0 when scanning stops at the end of the string after an empty match.
static int match_regex (_regex_t * r, const char * to_match)
{
    // "P" is a pointer into the string which points to the end of the previous match.
    const char * p = to_match;
    // "N_matches" is the maximum number of matches allowed.
    const int n_matches = 10;
    // "M" contains the matches found. Its size is a const int, not a constant
    // expression, so in C this is a variable-length array.
    _regmatch_t m[n_matches];

    while (1) {
        int i = 0;
        int nomatch = _regexec (r, p, n_matches, m, 0);
        if (nomatch) {
            printf ("No more matches.\n");
            return nomatch;
        }
        for (i = 0; i < n_matches; i++) {
            int start;
            int finish;
            // An offset of -1 marks the first unused sub-expression slot.
            if (m[i].rm_so == -1) {
                break;
            }
            // Offsets are relative to "p"; convert them to offsets into "to_match".
            start = m[i].rm_so + (p - to_match);
            finish = m[i].rm_eo + (p - to_match);
            if (i == 0) {
                printf ("$& is ");
            }
            else {
                printf ("$%d is ", i);
            }
            printf ("'%.*s' (bytes %d:%d)\n", (finish - start),
                    to_match + start, start, finish);
        }
        if (m[0].rm_eo > m[0].rm_so) {
            // Normal case: continue right after the text that was matched.
            p += m[0].rm_eo;
        }
        else if (p[m[0].rm_eo] == '\0') {
            // Empty match at the end of the string: nothing is left to scan.
            printf ("No more matches.\n");
            return 0;
        }
        else {
            // Empty match: step over one byte, otherwise "p" would never
            // advance and the same empty match would be found forever.
            p += m[0].rm_eo + 1;
        }
    }
    return 0;
}
Look at your code: where is the _regexec call? Inside the for loop?
Or did you intend it to be inside the while loop?
The code is not very readable...

czerny

  • Guest
Re: regex
« Reply #4 on: January 20, 2015, 03:00:39 pm »
Look at your code: where is the _regexec call? Inside the for loop?
Or did you intend it to be inside the while loop?
It is not (and should not be) inside the for loop.
It is in the while loop and that's ok!

Offline frankie

  • Global Moderator
  • Member
  • *****
  • Posts: 1698
Re: regex
« Reply #5 on: January 20, 2015, 05:10:12 pm »
Congratulations! You win! This is another bug.  ;D
If you change to:
Code: [Select]
// Match the string in "to_match" against the compiled regular expression in "r",
// printing every successive match and its sub-expressions together with their
// byte offsets relative to the start of "to_match".
//
// Returns the non-zero status of the _regexec() call that found no further match,
// or 0 when scanning stops at the end of the string after an empty match.

#define NMATCHES 10
static int match_regex (_regex_t * r, const char * to_match)
{
    // "P" is a pointer into the string which points to the end of the previous match.
    const char * p = to_match;
    // "N_matches" is the maximum number of matches allowed.
    const int n_matches = NMATCHES;
    // "M" contains the matches found. It is sized by the NMATCHES macro, so it
    // is an ordinary fixed-size array rather than a variable-length one.
    _regmatch_t m[NMATCHES];

    while (1) {
        int i = 0;
        int nomatch = _regexec (r, p, n_matches, m, 0);
        if (nomatch) {
            printf ("No more matches.\n");
            return nomatch;
        }
        for (i = 0; i < n_matches; i++) {
            int start;
            int finish;
            // An offset of -1 marks the first unused sub-expression slot.
            if (m[i].rm_so == -1) {
                break;
            }
            // Offsets are relative to "p"; convert them to offsets into "to_match".
            start = m[i].rm_so + (p - to_match);
            finish = m[i].rm_eo + (p - to_match);
            if (i == 0) {
                printf ("$& is ");
            }
            else {
                printf ("$%d is ", i);
            }
            printf ("'%.*s' (bytes %d:%d)\n", (finish - start),
                    to_match + start, start, finish);
        }
        if (m[0].rm_eo > m[0].rm_so) {
            // Normal case: continue right after the text that was matched.
            p += m[0].rm_eo;
        }
        else if (p[m[0].rm_eo] == '\0') {
            // Empty match at the end of the string: nothing is left to scan.
            printf ("No more matches.\n");
            return 0;
        }
        else {
            // Empty match: step over one byte, otherwise "p" would never
            // advance and the same empty match would be found forever.
            p += m[0].rm_eo + 1;
        }
    }
    return 0;
}
It works.
For some strange reason the compiler restores the stack pointer inside the while loop (mov esp,ebx), before the scope of the variable has ended!  >:(
I'm moving this topic to bug reports.

czerny

  • Guest
Re: regex
« Reply #6 on: January 20, 2015, 07:02:48 pm »
By the way, this seems to be a new bug. With 7RC4 everything works.
« Last Edit: January 20, 2015, 07:08:02 pm by czerny »