Updated regexp example to Rust stable
Getty Ritter
9 years ago
10 | 10 | use re::compile; |
11 | 11 | mod re; |
12 | 12 | |
13 | fn as_chars(s: &str) -> Vec<char> { | |
14 | s.chars().collect() | |
15 | } | |
16 | ||
13 | 17 | fn main() { |
14 | 18 | /* our sample regexp corresponds to /ab*c/ in |
15 | 19 | * the usual notation. |
16 | 20 | * These two lines can be collapsed into one once |
17 | 21 | * this RFC lands: https://github.com/rust-lang/rfcs/pull/66 |
18 | 22 | */ |
19 | let regexp = compile("..a*bc"); | |
20 | let instrs = regexp.as_slice(); | |
23 | let instrs = &compile("..a*bc"); | |
21 | 24 | |
22 | 25 | println!("Recursive:"); |
23 | 26 | println!(" match(re, \"abbbc\")\t== {}", |
24 | ::re::recursive::eval(instrs, "abbbc")); | |
27 | ::re::recursive::eval(instrs, &as_chars("abbbc"))); | |
25 | 28 | println!(" match(re, \"ac\")\t== {}", |
26 | ::re::recursive::eval(instrs, "ac")); | |
29 | ::re::recursive::eval(instrs, &as_chars("ac"))); | |
27 | 30 | println!(" match(re, \"abd\")\t== {}", |
28 | ::re::recursive::eval(instrs, "abd")); | |
31 | ::re::recursive::eval(instrs, &as_chars("abd"))); | |
29 | 32 | |
30 | 33 | println!("Manual Stack:"); |
31 | 34 | println!(" match(re, \"abbbc\")\t== {}", |
32 | ::re::stack::eval(instrs, "abbbc")); | |
35 | ::re::stack::eval(instrs, &as_chars("abbbc"))); | |
33 | 36 | println!(" match(re, \"ac\")\t== {}", |
34 | ::re::stack::eval(instrs, "ac")); | |
37 | ::re::stack::eval(instrs, &as_chars("ac"))); | |
35 | 38 | println!(" match(re, \"abd\")\t== {}", |
36 | ::re::stack::eval(instrs, "abd")); | |
39 | ::re::stack::eval(instrs, &as_chars("abd"))); | |
37 | 40 | } |
1 | 1 | use re::instruction::Instr; |
2 |
|
|
2 | use std::str::Chars; | |
3 | 3 | |
4 | 4 | /* A regular expression parse tree */ |
5 |
#[deriv |
|
5 | #[derive(Debug)] | |
6 | 6 | enum Regexp { |
7 | 7 | Char(char), |
8 | 8 | Seq(Box<Regexp>, Box<Regexp>), |
9 | 9 | Chc(Box<Regexp>, Box<Regexp>), |
10 | 10 | Rep(Box<Regexp>), |
11 | } | |
12 | ||
13 | fn chr(c: char) -> Box<Regexp> { | |
14 | Box::new(Regexp::Char(c)) | |
15 | } | |
16 | ||
17 | fn seq(l: Box<Regexp>, r: Box<Regexp>) -> Box<Regexp> { | |
18 | Box::new(Regexp::Seq(l, r)) | |
19 | } | |
20 | ||
21 | fn chc(l: Box<Regexp>, r: Box<Regexp>) -> Box<Regexp> { | |
22 | Box::new(Regexp::Chc(l, r)) | |
23 | } | |
24 | ||
25 | fn rep(x: Box<Regexp>) -> Box<Regexp> { | |
26 | Box::new(Regexp::Rep(x)) | |
11 | 27 | } |
12 | 28 | |
13 | 29 | /* We're assuming a prefix regexp here. That means that we have |
19 | 35 | * *|c.ab |
20 | 36 | * This is easier to parse. Deal with it. |
21 | 37 | */ |
22 | fn parse<'a>(s: &'a str) -> (&'a str, Regexp) { | |
23 | match s.char_at(0) { | |
24 | '.' => { let (s1, r1) = parse(s.slice_from(1)); | |
25 | let (s2, r2) = parse(s1); | |
26 | (s2, Regexp::Seq(box r1, box r2)) }, | |
27 | '|' => { let (s1, r1) = parse(s.slice_from(1)); | |
28 | let (s2, r2) = parse(s1); | |
29 | (s2, Regexp::Chc(box r1, box r2)) }, | |
30 | '*' => { let (s1, r1) = parse(s.slice_from(1)); | |
31 | (s1, Regexp::Rep(box r1)) }, | |
32 | c => (s.slice_from(1), Regexp::Char(c)), | |
38 | fn parse<'a>(s: &'a mut Chars<'a>) -> (&'a mut Chars<'a>, Box<Regexp>) { | |
39 | match s.next() { | |
40 | Some('.') => { let (s1, r1) = parse(s); | |
41 | let (s2, r2) = parse(s1); | |
42 | (s2, seq(r1, r2)) }, | |
43 | Some('|') => { let (s1, r1) = parse(s); | |
44 | let (s2, r2) = parse(s1); | |
45 | (s2, chc(r1, r2)) }, | |
46 | Some('*') => { let (s1, r1) = parse(s); | |
47 | (s1, rep(r1)) }, | |
48 | Some(c) => (s, chr(c)), | |
49 | None => panic!("Unexpected EOF"), | |
50 | } | |
51 | } | |
52 | ||
53 | /* This should eventually be added to a stable API, but right now | |
54 | * isn't available in the stable stdlib. */ | |
55 | fn push_all<'a, A, I>(target: &mut Vec<A>, source: I) | |
56 | where A: Clone + 'a, I: Iterator<Item=&'a A> { | |
57 | for x in source { | |
58 | target.push(x.clone()); | |
33 | 59 | } |
34 | 60 | } |
35 | 61 | |
38 | 64 | * vector (so that subsequent instructions to be added |
39 | 65 | * know what pc to use) and the vector of instructions. |
40 | 66 | */ |
41 | fn emit(regexp: &Regexp, pc: uint) -> (uint, Vec<Instr>) { | |
67 | fn emit(regexp: &Regexp, pc: usize) -> (usize, Vec<Instr>) { | |
42 | 68 | match *regexp { |
43 | 69 | /* For a match, we produce this code: |
44 | 70 | * ---- <- pc |
53 | 79 | * | [[ second ]] |
54 | 80 | * ---- <- second_pc |
55 | 81 | */ |
56 |
Regexp::Seq( |
|
82 | Regexp::Seq(ref first, ref second) => | |
57 | 83 | { let (first_pc, mut v1) = emit(first, pc); |
58 | 84 | let (second_pc, v2) = emit(second, first_pc); |
59 | v1.push_all(v2.as_slice()); | |
85 | push_all(&mut v1, v2.iter()); | |
60 | 86 | (second_pc, v1) |
61 | 87 | }, |
62 | 88 | /* For a choice, we produce this code: |
70 | 96 | * | [[ second ]] |
71 | 97 | * ---- <- second_pc |
72 | 98 | */ |
73 |
Regexp::Chc( |
|
99 | Regexp::Chc(ref first, ref second) => | |
74 | 100 | { let (first_pc, v1) = emit(first, pc + 1); |
75 | 101 | let (second_pc, v2) = emit(second, first_pc + 1); |
76 | 102 | let mut split_instr = vec![ Instr::Split(pc + 1, first_pc + 1) ]; |
77 | 103 | let jmp_instr = vec![ Instr::Jmp(second_pc) ]; |
78 | split_instr.push_all(v1.as_slice()); | |
79 | split_instr.push_all(jmp_instr.as_slice()); | |
80 | split_instr.push_all(v2.as_slice()); | |
104 | push_all(&mut split_instr, v1.iter()); | |
105 | push_all(&mut split_instr, jmp_instr.iter()); | |
106 | push_all(&mut split_instr, v2.iter()); | |
81 | 107 | (second_pc, split_instr) |
82 | 108 | }, |
83 | 109 | /* For a repetition, we produce this code: |
89 | 115 | * | IJmp(pc) |
90 | 116 | * ---- <- expr_pc + 1 |
91 | 117 | */ |
92 |
Regexp::Rep( |
|
118 | Regexp::Rep(ref expr) => | |
93 | 119 | { let (expr_pc, v1) = emit(expr, pc + 1); |
94 | 120 | let mut spl = vec![ Instr::Split(pc + 1, expr_pc + 1) ]; |
95 | 121 | let jmp = vec![ Instr::Jmp(pc) ]; |
96 | spl.push_all(v1.as_slice()); | |
97 | spl.push_all(jmp.as_slice()); | |
122 | push_all(&mut spl, v1.iter()); | |
123 | push_all(&mut spl, jmp.iter()); | |
98 | 124 | (expr_pc + 1, spl) |
99 | 125 | }, |
100 | 126 | } |
102 | 128 | |
103 | 129 | /* A wrapper over these processes */ |
104 | 130 | pub fn compile(s: &str) -> Vec<Instr> { |
105 | let (_, re) = parse(s); | |
106 | println!("{}", re); | |
131 | let (_, re) = parse(&mut s.chars()); | |
132 | println!("{:?}", re); | |
107 | 133 | let (_, mut ins) = emit(&re, 0); |
108 | println!("{}", ins); | |
134 | println!("{:?}", ins); | |
109 | 135 | /* If we get to the end of a compiled regular expression, |
110 | 136 | * that means it hasn't aborted and we can match. |
111 | 137 | */ |
1 | 1 | /* A single instruction as used in the VM-based matcher */ |
2 |
#[deriv |
|
2 | #[derive(Clone,Debug)] | |
3 | 3 | pub enum Instr { |
4 | 4 | Char(char), /* match a character or fail */ |
5 | 5 | Match, /* match anything successfully */ |
6 | Jmp(uint) , /* jump to instr i */ | |
7 | Split(uint, uint), /* try both instrs i and j */ | |
8 | }⏎ | |
6 | Jmp(usize), /* jump to instr i */ | |
7 | Split(usize, usize), /* try both instrs i and j */ | |
8 | } |
2 | 2 | |
3 | 3 | /* We wrap the real evaluation function, as we're always going to |
4 | 4 | * start executing instruction 0 with no string matched. */ |
5 |
pub fn eval(instrs: &[Instr], input: & |
|
5 | pub fn eval(instrs: &[Instr], input: &[char]) -> bool { | |
6 | 6 | eval1(instrs, input, 0, 0) |
7 | 7 | } |
8 | 8 | |
11 | 11 | * a string we're matching over, the current program counter |
12 | 12 | * in the instructions, and the current point to which we've |
13 | 13 | * traversed the string. */ |
14 |
fn eval1(instrs: &[Instr], input: & |
|
14 | fn eval1(instrs: &[Instr], input: &[char], pc: usize, cc: usize) -> bool { | |
15 | 15 | match instrs[pc] { |
16 | 16 | Instr::Char(_) if cc >= input.len() => return false, |
17 |
Instr::Char(c) if c == input |
|
17 | Instr::Char(c) if c == input[cc] => | |
18 | 18 | eval1(instrs, input, pc + 1, cc + 1), |
19 | 19 | Instr::Char(_) => return false, |
20 | 20 | Instr::Match => return true, |
2 | 2 | |
3 | 3 | /* The state of a program can be unambiguously specified by |
4 | 4 | * a current instruction and a current position in the string. */ |
5 | struct EvalState { pc: uint, cc: uint } | |
5 | struct EvalState { pc: usize, cc: usize } | |
6 | 6 | |
7 | 7 | /* An evaluator that maintains a manual, mutable stack for doing |
8 | 8 | * regular-expression matching. */ |
9 |
pub fn eval(instrs: &[Instr], input: & |
|
9 | pub fn eval(instrs: &[Instr], input: &[char]) -> bool { | |
10 | 10 | let mut stack = vec![ EvalState {pc: 0, cc: 0} ]; |
11 | 11 | |
12 | 12 | /* Every time we find that a possibility is impossible, we |
21 | 21 | match instrs[st.pc] { |
22 | 22 | Instr::Char(_) if st.cc >= input.len() => |
23 | 23 | continue, |
24 |
Instr::Char(c) if c == input |
|
24 | Instr::Char(c) if c == input[st.cc] => | |
25 | 25 | stack.push(EvalState { pc: st.pc + 1, cc: st.cc + 1 }), |
26 | 26 | Instr::Char(_) => |
27 | 27 | continue, |