Updated regexp to use Cargo, as well.
Getty Ritter
10 years ago
1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
2 | // use std::vec::Vec; | |
3 | ||
4 | /* A regular expression parse tree */ | |
5 | #[deriving(Show)] | |
6 | enum Regexp { | |
7 | RChar(char), | |
8 | RSeq(Box<Regexp>, Box<Regexp>), | |
9 | RChc(Box<Regexp>, Box<Regexp>), | |
10 | RRep(Box<Regexp>), | |
11 | } | |
12 | ||
13 | /* We're assuming a prefix regexp here. That means that we have | |
14 | * the following operators: | |
15 | * .ab => ab | |
16 | * |ab => a|b | |
17 | * *a => a* | |
18 | * but these nest, so (ab|c)* would become | |
19 | * *|c.ab | |
20 | * This is easier to parse. Deal with it. | |
21 | */ | |
22 | fn parse<'a>(s: &'a str) -> (&'a str, Regexp) { | |
23 | match s.char_at(0) { | |
24 | '.' => { let (s1, r1) = parse(s.slice_from(1)); | |
25 | let (s2, r2) = parse(s1); | |
26 | (s2, RSeq(box r1, box r2)) }, | |
27 | '|' => { let (s1, r1) = parse(s.slice_from(1)); | |
28 | let (s2, r2) = parse(s1); | |
29 | (s2, RChc(box r1, box r2)) }, | |
30 | '*' => { let (s1, r1) = parse(s.slice_from(1)); | |
31 | (s1, RRep(box r1)) }, | |
32 | c => (s.slice_from(1), RChar(c)), | |
33 | } | |
34 | } | |
35 | ||
36 | /* Compiling an AST for regexps to the instructions */ | |
37 | fn emit(r: &Regexp, i: uint) -> (uint, Vec<Instr>) { | |
38 | match *r { | |
39 | RChar(c) => { (i+1, vec![IChar(c)]) }, | |
40 | RSeq(box ref a, box ref b) => | |
41 | { let (ai, mut v1) = emit(a, i); | |
42 | let (bi, v2) = emit(b, ai); | |
43 | v1.push_all_move(v2); | |
44 | (bi, v1) }, | |
45 | RChc(box ref a, box ref b) => | |
46 | { let (ai, v1) = emit(a, i + 1); | |
47 | let (bi, v2) = emit(b, ai + 1); | |
48 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
49 | let jmp = vec![ IJmp(ai) ]; | |
50 | spl.push_all_move(v1); | |
51 | spl.push_all_move(jmp); | |
52 | spl.push_all_move(v2); | |
53 | (bi, spl) }, | |
54 | RRep(box ref a) => | |
55 | { let (ai, v1) = emit(a, i + 1); | |
56 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
57 | let jmp = vec![ IJmp(i) ]; | |
58 | spl.push_all_move(v1); | |
59 | spl.push_all_move(jmp); | |
60 | (ai + 1, spl) }, | |
61 | } | |
62 | } | |
63 | ||
64 | /* A wrapper over these processes */ | |
65 | pub fn compile(s: &str) -> Vec<Instr> { | |
66 | let (_, re) = parse(s); | |
67 | println!("{}", re); | |
68 | let (_, ins) = emit(&re, 0); | |
69 | println!("{}", ins); | |
70 | return ins.append([IMatch]); | |
71 | } |
1 | /* A single instruction as used in the VM-based matcher */ | |
2 | #[deriving(Clone,Show)] | |
3 | pub enum Instr { | |
4 | IChar(char), /* match a character or fail */ | |
5 | IMatch, /* match anything successfully */ | |
6 | IJmp(uint) , /* jump to instr i */ | |
7 | ISplit(uint, uint), /* try both instrs i and j */ | |
8 | } |
1 | pub use re::compile::compile; | |
2 | pub use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
3 | pub use re::recursive::eval; | |
4 | pub use re::stack::eval; | |
5 | pub mod compile; | |
6 | pub mod instruction; | |
7 | pub mod recursive; | |
8 | pub mod stack; |
1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
2 | ||
3 | /* We wrap the real evaluation function, as we're always going to | |
4 | * start executing instruction 0 with no string matched. */ | |
5 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
6 | eval1(instrs, input, 0, 0) | |
7 | } | |
8 | ||
9 | /* We use the Rust stack as our stack in this naive recursive | |
10 | * implementation. */ | |
11 | fn eval1(instrs: &[Instr], input: &str, pc: uint, cc: uint) -> bool { | |
12 | match instrs[pc] { | |
13 | IChar(_) if cc >= input.len() => return false, | |
14 | IChar(c) if c == input.char_at(cc) => | |
15 | eval1(instrs, input, pc + 1, cc + 1), | |
16 | IChar(_) => return false, | |
17 | IMatch => return true, | |
18 | IJmp(i) => eval1(instrs, input, i, cc), | |
19 | ISplit(i, _) if eval1(instrs, input, i, cc) => true, | |
20 | ISplit(_, j) => eval1(instrs, input, j, cc), | |
21 | } | |
22 | } |
1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
2 | ||
3 | /* The state of a program can be unambiguously specified by | |
4 | * a current instruction and a current position in the string. */ | |
5 | struct EvalState { pc: uint, cc: uint } | |
6 | ||
7 | /* An evaluator that maintains a manual, mutable stack for doing | |
8 | * regular-expression matching. */ | |
9 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
10 | let mut stack = vec![ EvalState {pc: 0, cc: 0} ]; | |
11 | ||
12 | while stack.len() > 0 { | |
13 | let st = stack.pop().unwrap(); | |
14 | match instrs[st.pc] { | |
15 | IChar(_) if st.cc >= input.len() => | |
16 | continue, | |
17 | IChar(c) if c == input.char_at(st.cc) => | |
18 | stack.push(EvalState { pc: st.pc + 1, cc: st.cc + 1 }), | |
19 | IChar(_) => | |
20 | continue, | |
21 | IMatch => | |
22 | return true, | |
23 | IJmp(i) => | |
24 | stack.push(EvalState { pc: i, cc: st.cc }), | |
25 | ISplit(i, j) => { | |
26 | stack.push(EvalState { pc: j, cc: st.cc }); | |
27 | stack.push(EvalState { pc: i, cc: st.cc }); | |
28 | }, | |
29 | } | |
30 | } | |
31 | return false; | |
32 | } |
# redo build script: declare the source files as dependencies, then
# compile the crate root.
DEPS="regexp.rs re/compile.rs re/instruction.rs re/mod.rs re/recursive.rs re/stack.rs"
# $DEPS is deliberately unquoted so it splits into one argument per file.
redo-ifchange $DEPS
# $3 is the output file name handed to the script by redo; quote it so a
# path containing whitespace reaches rustc as a single argument.
rustc regexp.rs -o "$3"
1 | /* This is a basic implementation of a regular expression matcher, | |
2 | * based on Henry Spencer's virtual-machine approach to regular | |
3 | * expression matching outlined by Russ Cox here: | |
4 | * http://swtch.com/~rsc/regexp/regexp2.html | |
5 | * | |
6 | * For ease of parsing, I'm using a highly non-standard Polish | |
7 | * notation for regular expressions, in which . and | are | |
8 | * prefix binary operators for catenation and choice, respectively, | |
9 | * and * is a prefix unary operator for repetition. */ | |
10 | use re::compile; | |
11 | mod re; | |
12 | ||
13 | fn main() { | |
14 | /* our sample regexp corresponds to /ab*c/ in | |
15 | * the usual notation. */ | |
16 | let re = compile("..a*bc"); | |
17 | println!("Recursive:"); | |
18 | println!(" match(re, \"abbbc\")\t== {}", | |
19 | ::re::recursive::eval(re.as_slice(), "abbbc")); | |
20 | println!(" match(re, \"ac\")\t== {}", | |
21 | ::re::recursive::eval(re.as_slice(), "ac")); | |
22 | println!(" match(re, \"abd\")\t== {}", | |
23 | ::re::recursive::eval(re.as_slice(), "abd")); | |
24 | println!("Manual Stack:"); | |
25 | println!(" match(re, \"abbbc\")\t== {}", | |
26 | ::re::stack::eval(re.as_slice(), "abbbc")); | |
27 | println!(" match(re, \"ac\")\t== {}", | |
28 | ::re::stack::eval(re.as_slice(), "ac")); | |
29 | println!(" match(re, \"abd\")\t== {}", | |
30 | ::re::stack::eval(re.as_slice(), "abd")); | |
31 | } |
1 | /* This is a basic implementation of a regular expression matcher, | |
2 | * based on Henry Spencer's virtual-machine approach to regular | |
3 | * expression matching outlined by Russ Cox here: | |
4 | * http://swtch.com/~rsc/regexp/regexp2.html | |
5 | * | |
6 | * For ease of parsing, I'm using a highly non-standard Polish | |
7 | * notation for regular expressions, in which . and | are | |
8 | * prefix binary operators for catenation and choice, respectively, | |
9 | * and * is a prefix unary operator for repetition. */ | |
10 | use re::compile; | |
11 | mod re; | |
12 | ||
13 | fn main() { | |
14 | /* our sample regexp corresponds to /ab*c/ in | |
15 | * the usual notation. */ | |
16 | let re = compile("..a*bc"); | |
17 | println!("Recursive:"); | |
18 | println!(" match(re, \"abbbc\")\t== {}", | |
19 | ::re::recursive::eval(re.as_slice(), "abbbc")); | |
20 | println!(" match(re, \"ac\")\t== {}", | |
21 | ::re::recursive::eval(re.as_slice(), "ac")); | |
22 | println!(" match(re, \"abd\")\t== {}", | |
23 | ::re::recursive::eval(re.as_slice(), "abd")); | |
24 | println!("Manual Stack:"); | |
25 | println!(" match(re, \"abbbc\")\t== {}", | |
26 | ::re::stack::eval(re.as_slice(), "abbbc")); | |
27 | println!(" match(re, \"ac\")\t== {}", | |
28 | ::re::stack::eval(re.as_slice(), "ac")); | |
29 | println!(" match(re, \"abd\")\t== {}", | |
30 | ::re::stack::eval(re.as_slice(), "abd")); | |
31 | } |
1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
2 | // use std::vec::Vec; | |
3 | ||
4 | /* A regular expression parse tree */ | |
5 | #[deriving(Show)] | |
6 | enum Regexp { | |
7 | RChar(char), | |
8 | RSeq(Box<Regexp>, Box<Regexp>), | |
9 | RChc(Box<Regexp>, Box<Regexp>), | |
10 | RRep(Box<Regexp>), | |
11 | } | |
12 | ||
13 | /* We're assuming a prefix regexp here. That means that we have | |
14 | * the following operators: | |
15 | * .ab => ab | |
16 | * |ab => a|b | |
17 | * *a => a* | |
18 | * but these nest, so (ab|c)* would become | |
19 | * *|c.ab | |
20 | * This is easier to parse. Deal with it. | |
21 | */ | |
22 | fn parse<'a>(s: &'a str) -> (&'a str, Regexp) { | |
23 | match s.char_at(0) { | |
24 | '.' => { let (s1, r1) = parse(s.slice_from(1)); | |
25 | let (s2, r2) = parse(s1); | |
26 | (s2, RSeq(box r1, box r2)) }, | |
27 | '|' => { let (s1, r1) = parse(s.slice_from(1)); | |
28 | let (s2, r2) = parse(s1); | |
29 | (s2, RChc(box r1, box r2)) }, | |
30 | '*' => { let (s1, r1) = parse(s.slice_from(1)); | |
31 | (s1, RRep(box r1)) }, | |
32 | c => (s.slice_from(1), RChar(c)), | |
33 | } | |
34 | } | |
35 | ||
36 | /* Compiling an AST for regexps to the instructions */ | |
37 | fn emit(r: &Regexp, i: uint) -> (uint, Vec<Instr>) { | |
38 | match *r { | |
39 | RChar(c) => { (i+1, vec![IChar(c)]) }, | |
40 | RSeq(box ref a, box ref b) => | |
41 | { let (ai, mut v1) = emit(a, i); | |
42 | let (bi, v2) = emit(b, ai); | |
43 | v1.push_all_move(v2); | |
44 | (bi, v1) }, | |
45 | RChc(box ref a, box ref b) => | |
46 | { let (ai, v1) = emit(a, i + 1); | |
47 | let (bi, v2) = emit(b, ai + 1); | |
48 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
49 | let jmp = vec![ IJmp(ai) ]; | |
50 | spl.push_all_move(v1); | |
51 | spl.push_all_move(jmp); | |
52 | spl.push_all_move(v2); | |
53 | (bi, spl) }, | |
54 | RRep(box ref a) => | |
55 | { let (ai, v1) = emit(a, i + 1); | |
56 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
57 | let jmp = vec![ IJmp(i) ]; | |
58 | spl.push_all_move(v1); | |
59 | spl.push_all_move(jmp); | |
60 | (ai + 1, spl) }, | |
61 | } | |
62 | } | |
63 | ||
64 | /* A wrapper over these processes */ | |
65 | pub fn compile(s: &str) -> Vec<Instr> { | |
66 | let (_, re) = parse(s); | |
67 | println!("{}", re); | |
68 | let (_, ins) = emit(&re, 0); | |
69 | println!("{}", ins); | |
70 | return ins.append([IMatch]); | |
71 | } |
1 | /* A single instruction as used in the VM-based matcher */ | |
2 | #[deriving(Clone,Show)] | |
3 | pub enum Instr { | |
4 | IChar(char), /* match a character or fail */ | |
5 | IMatch, /* match anything successfully */ | |
6 | IJmp(uint) , /* jump to instr i */ | |
7 | ISplit(uint, uint), /* try both instrs i and j */ | |
8 | } |
1 | pub use re::compile::compile; | |
2 | pub use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
3 | pub use re::recursive::eval; | |
4 | pub use re::stack::eval; | |
5 | pub mod compile; | |
6 | pub mod instruction; | |
7 | pub mod recursive; | |
8 | pub mod stack; |
1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
2 | ||
3 | /* We wrap the real evaluation function, as we're always going to | |
4 | * start executing instruction 0 with no string matched. */ | |
5 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
6 | eval1(instrs, input, 0, 0) | |
7 | } | |
8 | ||
9 | /* We use the Rust stack as our stack in this naive recursive | |
10 | * implementation. */ | |
11 | fn eval1(instrs: &[Instr], input: &str, pc: uint, cc: uint) -> bool { | |
12 | match instrs[pc] { | |
13 | IChar(_) if cc >= input.len() => return false, | |
14 | IChar(c) if c == input.char_at(cc) => | |
15 | eval1(instrs, input, pc + 1, cc + 1), | |
16 | IChar(_) => return false, | |
17 | IMatch => return true, | |
18 | IJmp(i) => eval1(instrs, input, i, cc), | |
19 | ISplit(i, _) if eval1(instrs, input, i, cc) => true, | |
20 | ISplit(_, j) => eval1(instrs, input, j, cc), | |
21 | } | |
22 | } |
1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
2 | ||
3 | /* The state of a program can be unambiguously specified by | |
4 | * a current instruction and a current position in the string. */ | |
5 | struct EvalState { pc: uint, cc: uint } | |
6 | ||
7 | /* An evaluator that maintains a manual, mutable stack for doing | |
8 | * regular-expression matching. */ | |
9 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
10 | let mut stack = vec![ EvalState {pc: 0, cc: 0} ]; | |
11 | ||
12 | while stack.len() > 0 { | |
13 | let st = stack.pop().unwrap(); | |
14 | match instrs[st.pc] { | |
15 | IChar(_) if st.cc >= input.len() => | |
16 | continue, | |
17 | IChar(c) if c == input.char_at(st.cc) => | |
18 | stack.push(EvalState { pc: st.pc + 1, cc: st.cc + 1 }), | |
19 | IChar(_) => | |
20 | continue, | |
21 | IMatch => | |
22 | return true, | |
23 | IJmp(i) => | |
24 | stack.push(EvalState { pc: i, cc: st.cc }), | |
25 | ISplit(i, j) => { | |
26 | stack.push(EvalState { pc: j, cc: st.cc }); | |
27 | stack.push(EvalState { pc: i, cc: st.cc }); | |
28 | }, | |
29 | } | |
30 | } | |
31 | return false; | |
32 | } |