gdritter repos rust-examples / baa8e8e
Updated regexp example to Rust stable Getty Ritter 9 years ago
5 changed file(s) with 73 addition(s) and 44 deletion(s). Collapse all Expand all
1010 use re::compile;
1111 mod re;
1212
13 fn as_chars(s: &str) -> Vec<char> {
14 s.chars().collect()
15 }
16
1317 fn main() {
1418 /* our sample regexp corresponds to /ab*c/ in
1519 * the usual notation.
1620 * These two lines can be collapsed into one once
1721 * this RFC lands: https://github.com/rust-lang/rfcs/pull/66
1822 */
19 let regexp = compile("..a*bc");
20 let instrs = regexp.as_slice();
23 let instrs = &compile("..a*bc");
2124
2225 println!("Recursive:");
2326 println!(" match(re, \"abbbc\")\t== {}",
24 ::re::recursive::eval(instrs, "abbbc"));
27 ::re::recursive::eval(instrs, &as_chars("abbbc")));
2528 println!(" match(re, \"ac\")\t== {}",
26 ::re::recursive::eval(instrs, "ac"));
29 ::re::recursive::eval(instrs, &as_chars("ac")));
2730 println!(" match(re, \"abd\")\t== {}",
28 ::re::recursive::eval(instrs, "abd"));
31 ::re::recursive::eval(instrs, &as_chars("abd")));
2932
3033 println!("Manual Stack:");
3134 println!(" match(re, \"abbbc\")\t== {}",
32 ::re::stack::eval(instrs, "abbbc"));
35 ::re::stack::eval(instrs, &as_chars("abbbc")));
3336 println!(" match(re, \"ac\")\t== {}",
34 ::re::stack::eval(instrs, "ac"));
37 ::re::stack::eval(instrs, &as_chars("ac")));
3538 println!(" match(re, \"abd\")\t== {}",
36 ::re::stack::eval(instrs, "abd"));
39 ::re::stack::eval(instrs, &as_chars("abd")));
3740 }
11 use re::instruction::Instr;
2 // use std::vec::Vec;
2 use std::str::Chars;
33
44 /* A regular expression parse tree */
5 #[deriving(Show)]
5 #[derive(Debug)]
66 enum Regexp {
77 Char(char),
88 Seq(Box<Regexp>, Box<Regexp>),
99 Chc(Box<Regexp>, Box<Regexp>),
1010 Rep(Box<Regexp>),
11 }
12
13 fn chr(c: char) -> Box<Regexp> {
14 Box::new(Regexp::Char(c))
15 }
16
17 fn seq(l: Box<Regexp>, r: Box<Regexp>) -> Box<Regexp> {
18 Box::new(Regexp::Seq(l, r))
19 }
20
21 fn chc(l: Box<Regexp>, r: Box<Regexp>) -> Box<Regexp> {
22 Box::new(Regexp::Chc(l, r))
23 }
24
25 fn rep(x: Box<Regexp>) -> Box<Regexp> {
26 Box::new(Regexp::Rep(x))
1127 }
1228
1329 /* We're assuming a prefix regexp here. That means that we have
1935 * *|c.ab
2036 * This is easier to parse. Deal with it.
2137 */
22 fn parse<'a>(s: &'a str) -> (&'a str, Regexp) {
23 match s.char_at(0) {
24 '.' => { let (s1, r1) = parse(s.slice_from(1));
25 let (s2, r2) = parse(s1);
26 (s2, Regexp::Seq(box r1, box r2)) },
27 '|' => { let (s1, r1) = parse(s.slice_from(1));
28 let (s2, r2) = parse(s1);
29 (s2, Regexp::Chc(box r1, box r2)) },
30 '*' => { let (s1, r1) = parse(s.slice_from(1));
31 (s1, Regexp::Rep(box r1)) },
32 c => (s.slice_from(1), Regexp::Char(c)),
38 fn parse<'a>(s: &'a mut Chars<'a>) -> (&'a mut Chars<'a>, Box<Regexp>) {
39 match s.next() {
40 Some('.') => { let (s1, r1) = parse(s);
41 let (s2, r2) = parse(s1);
42 (s2, seq(r1, r2)) },
43 Some('|') => { let (s1, r1) = parse(s);
44 let (s2, r2) = parse(s1);
45 (s2, chc(r1, r2)) },
46 Some('*') => { let (s1, r1) = parse(s);
47 (s1, rep(r1)) },
48 Some(c) => (s, chr(c)),
49 None => panic!("Unexpected EOF"),
50 }
51 }
52
53 /* This should eventually be added to a stable API, but right now
54 * isn't available in the stable stdlib. */
55 fn push_all<'a, A, I>(target: &mut Vec<A>, source: I)
56 where A: Clone + 'a, I: Iterator<Item=&'a A> {
57 for x in source {
58 target.push(x.clone());
3359 }
3460 }
3561
3864 * vector (so that subsequent instructions to be added
3965 * know what pc to use) and the vector of instructions.
4066 */
41 fn emit(regexp: &Regexp, pc: uint) -> (uint, Vec<Instr>) {
67 fn emit(regexp: &Regexp, pc: usize) -> (usize, Vec<Instr>) {
4268 match *regexp {
4369 /* For a match, we produce this code:
4470 * ---- <- pc
5379 * | [[ second ]]
5480 * ---- <- second_pc
5581 */
56 Regexp::Seq(box ref first, box ref second) =>
82 Regexp::Seq(ref first, ref second) =>
5783 { let (first_pc, mut v1) = emit(first, pc);
5884 let (second_pc, v2) = emit(second, first_pc);
59 v1.push_all(v2.as_slice());
85 push_all(&mut v1, v2.iter());
6086 (second_pc, v1)
6187 },
6288 /* For a choice, we produce this code:
7096 * | [[ second ]]
7197 * ---- <- second_pc
7298 */
73 Regexp::Chc(box ref first, box ref second) =>
99 Regexp::Chc(ref first, ref second) =>
74100 { let (first_pc, v1) = emit(first, pc + 1);
75101 let (second_pc, v2) = emit(second, first_pc + 1);
76102 let mut split_instr = vec![ Instr::Split(pc + 1, first_pc + 1) ];
77103 let jmp_instr = vec![ Instr::Jmp(second_pc) ];
78 split_instr.push_all(v1.as_slice());
79 split_instr.push_all(jmp_instr.as_slice());
80 split_instr.push_all(v2.as_slice());
104 push_all(&mut split_instr, v1.iter());
105 push_all(&mut split_instr, jmp_instr.iter());
106 push_all(&mut split_instr, v2.iter());
81107 (second_pc, split_instr)
82108 },
83109 /* For a repetition, we produce this code:
89115 * | IJmp(pc)
90116 * ---- <- expr_pc + 1
91117 */
92 Regexp::Rep(box ref expr) =>
118 Regexp::Rep(ref expr) =>
93119 { let (expr_pc, v1) = emit(expr, pc + 1);
94120 let mut spl = vec![ Instr::Split(pc + 1, expr_pc + 1) ];
95121 let jmp = vec![ Instr::Jmp(pc) ];
96 spl.push_all(v1.as_slice());
97 spl.push_all(jmp.as_slice());
122 push_all(&mut spl, v1.iter());
123 push_all(&mut spl, jmp.iter());
98124 (expr_pc + 1, spl)
99125 },
100126 }
102128
103129 /* A wrapper over these processes */
104130 pub fn compile(s: &str) -> Vec<Instr> {
105 let (_, re) = parse(s);
106 println!("{}", re);
131 let (_, re) = parse(&mut s.chars());
132 println!("{:?}", re);
107133 let (_, mut ins) = emit(&re, 0);
108 println!("{}", ins);
134 println!("{:?}", ins);
109135 /* If we get to the end of a compiled regular expression,
110136 * that means it hasn't aborted and we can match.
111137 */
11 /* A single instruction as used in the VM-based matcher */
2 #[deriving(Clone,Show)]
2 #[derive(Clone,Debug)]
33 pub enum Instr {
44 Char(char), /* match a character or fail */
55 Match, /* match anything successfully */
6 Jmp(uint) , /* jump to instr i */
7 Split(uint, uint), /* try both instrs i and j */
8 }
6 Jmp(usize), /* jump to instr i */
7 Split(usize, usize), /* try both instrs i and j */
8 }
22
33 /* We wrap the real evaluation function, as we're always going to
44 * start executing instruction 0 with no string matched. */
5 pub fn eval(instrs: &[Instr], input: &str) -> bool {
5 pub fn eval(instrs: &[Instr], input: &[char]) -> bool {
66 eval1(instrs, input, 0, 0)
77 }
88
1111 * a string we're matching over, the current program counter
1212 * in the instructions, and the current point to which we've
1313 * traversed the string. */
14 fn eval1(instrs: &[Instr], input: &str, pc: uint, cc: uint) -> bool {
14 fn eval1(instrs: &[Instr], input: &[char], pc: usize, cc: usize) -> bool {
1515 match instrs[pc] {
1616 Instr::Char(_) if cc >= input.len() => return false,
17 Instr::Char(c) if c == input.char_at(cc) =>
17 Instr::Char(c) if c == input[cc] =>
1818 eval1(instrs, input, pc + 1, cc + 1),
1919 Instr::Char(_) => return false,
2020 Instr::Match => return true,
22
33 /* The state of a program can be unambiguously specified by
44 * a current instruction and a current position in the string. */
5 struct EvalState { pc: uint, cc: uint }
5 struct EvalState { pc: usize, cc: usize }
66
77 /* An evaluator that maintains a manual, mutable stack for doing
88 * regular-expression matching. */
9 pub fn eval(instrs: &[Instr], input: &str) -> bool {
9 pub fn eval(instrs: &[Instr], input: &[char]) -> bool {
1010 let mut stack = vec![ EvalState {pc: 0, cc: 0} ];
1111
1212 /* Every time we find that a possibility is impossible, we
2121 match instrs[st.pc] {
2222 Instr::Char(_) if st.cc >= input.len() =>
2323 continue,
24 Instr::Char(c) if c == input.char_at(st.cc) =>
24 Instr::Char(c) if c == input[st.cc] =>
2525 stack.push(EvalState { pc: st.pc + 1, cc: st.cc + 1 }),
2626 Instr::Char(_) =>
2727 continue,