| 1 | 1 |
use re::instruction::Instr;
|
| 2 | |
// use std::vec::Vec;
|
| 2 |
use std::str::Chars;
|
| 3 | 3 |
|
| 4 | 4 |
/* A regular expression parse tree */
|
| 5 | |
#[deriving(Show)]
|
| 5 |
/// Parse tree for the prefix-notation regular expressions handled here.
///
/// `Clone` and `PartialEq` are derived in addition to `Debug` so that
/// trees can be duplicated and compared structurally (e.g. in tests).
#[derive(Debug, Clone, PartialEq)]
enum Regexp {
    /// A single character; the parser builds this for any input
    /// character that is not one of the operators `.`, `|` or `*`.
    Char(char),
    /// Sequence (`.` in the prefix syntax): the first sub-expression
    /// followed by the second.
    Seq(Box<Regexp>, Box<Regexp>),
    /// Choice (`|` in the prefix syntax) between two sub-expressions.
    Chc(Box<Regexp>, Box<Regexp>),
    /// Repetition (`*` in the prefix syntax) of one sub-expression.
    Rep(Box<Regexp>),
}
|
| 12 |
|
| 13 |
fn chr(c: char) -> Box<Regexp> {
|
| 14 |
Box::new(Regexp::Char(c))
|
| 15 |
}
|
| 16 |
|
| 17 |
fn seq(l: Box<Regexp>, r: Box<Regexp>) -> Box<Regexp> {
|
| 18 |
Box::new(Regexp::Seq(l, r))
|
| 19 |
}
|
| 20 |
|
| 21 |
fn chc(l: Box<Regexp>, r: Box<Regexp>) -> Box<Regexp> {
|
| 22 |
Box::new(Regexp::Chc(l, r))
|
| 23 |
}
|
| 24 |
|
| 25 |
fn rep(x: Box<Regexp>) -> Box<Regexp> {
|
| 26 |
Box::new(Regexp::Rep(x))
|
| 11 | 27 |
}
|
| 12 | 28 |
|
| 13 | 29 |
/* We're assuming a prefix regexp here. That means that we have
|
|
| 19 | 35 |
* *|c.ab
|
| 20 | 36 |
* This is easier to parse. Deal with it.
|
| 21 | 37 |
*/
|
| 22 | |
fn parse<'a>(s: &'a str) -> (&'a str, Regexp) {
|
| 23 | |
match s.char_at(0) {
|
| 24 | |
'.' => { let (s1, r1) = parse(s.slice_from(1));
|
| 25 | |
let (s2, r2) = parse(s1);
|
| 26 | |
(s2, Regexp::Seq(box r1, box r2)) },
|
| 27 | |
'|' => { let (s1, r1) = parse(s.slice_from(1));
|
| 28 | |
let (s2, r2) = parse(s1);
|
| 29 | |
(s2, Regexp::Chc(box r1, box r2)) },
|
| 30 | |
'*' => { let (s1, r1) = parse(s.slice_from(1));
|
| 31 | |
(s1, Regexp::Rep(box r1)) },
|
| 32 | |
c => (s.slice_from(1), Regexp::Char(c)),
|
| 38 |
fn parse<'a>(s: &'a mut Chars<'a>) -> (&'a mut Chars<'a>, Box<Regexp>) {
|
| 39 |
match s.next() {
|
| 40 |
Some('.') => { let (s1, r1) = parse(s);
|
| 41 |
let (s2, r2) = parse(s1);
|
| 42 |
(s2, seq(r1, r2)) },
|
| 43 |
Some('|') => { let (s1, r1) = parse(s);
|
| 44 |
let (s2, r2) = parse(s1);
|
| 45 |
(s2, chc(r1, r2)) },
|
| 46 |
Some('*') => { let (s1, r1) = parse(s);
|
| 47 |
(s1, rep(r1)) },
|
| 48 |
Some(c) => (s, chr(c)),
|
| 49 |
None => panic!("Unexpected EOF"),
|
| 50 |
}
|
| 51 |
}
|
| 52 |
|
| 53 |
/* This should eventually be added to a stable API, but right now
|
| 54 |
* isn't available in the stable stdlib. */
|
| 55 |
/// Appends a clone of every element yielded by `source` to the end of
/// `target`, preserving iteration order.
///
/// `source` yields references, so the borrowed collection is left
/// untouched; each element is cloned into `target`.
fn push_all<'a, A, I>(target: &mut Vec<A>, source: I)
    where A: Clone + 'a, I: Iterator<Item=&'a A> {
    // `extend` can reserve capacity up front from the iterator's size
    // hint, unlike a manual loop of individual `push` calls.
    target.extend(source.cloned());
}
|
| 35 | 61 |
|
|
| 38 | 64 |
* vector (so that subsequent instructions to be added
|
| 39 | 65 |
* know what pc to use) and the vector of instructions.
|
| 40 | 66 |
*/
|
| 41 | |
fn emit(regexp: &Regexp, pc: uint) -> (uint, Vec<Instr>) {
|
| 67 |
fn emit(regexp: &Regexp, pc: usize) -> (usize, Vec<Instr>) {
|
| 42 | 68 |
match *regexp {
|
| 43 | 69 |
/* For a match, we produce this code:
|
| 44 | 70 |
* ---- <- pc
|
|
| 53 | 79 |
* | [[ second ]]
|
| 54 | 80 |
* ---- <- second_pc
|
| 55 | 81 |
*/
|
| 56 | |
Regexp::Seq(box ref first, box ref second) =>
|
| 82 |
Regexp::Seq(ref first, ref second) =>
|
| 57 | 83 |
{ let (first_pc, mut v1) = emit(first, pc);
|
| 58 | 84 |
let (second_pc, v2) = emit(second, first_pc);
|
| 59 | |
v1.push_all(v2.as_slice());
|
| 85 |
push_all(&mut v1, v2.iter());
|
| 60 | 86 |
(second_pc, v1)
|
| 61 | 87 |
},
|
| 62 | 88 |
/* For a choice, we produce this code:
|
|
| 70 | 96 |
* | [[ second ]]
|
| 71 | 97 |
* ---- <- second_pc
|
| 72 | 98 |
*/
|
| 73 | |
Regexp::Chc(box ref first, box ref second) =>
|
| 99 |
Regexp::Chc(ref first, ref second) =>
|
| 74 | 100 |
{ let (first_pc, v1) = emit(first, pc + 1);
|
| 75 | 101 |
let (second_pc, v2) = emit(second, first_pc + 1);
|
| 76 | 102 |
let mut split_instr = vec![ Instr::Split(pc + 1, first_pc + 1) ];
|
| 77 | 103 |
let jmp_instr = vec![ Instr::Jmp(second_pc) ];
|
| 78 | |
split_instr.push_all(v1.as_slice());
|
| 79 | |
split_instr.push_all(jmp_instr.as_slice());
|
| 80 | |
split_instr.push_all(v2.as_slice());
|
| 104 |
push_all(&mut split_instr, v1.iter());
|
| 105 |
push_all(&mut split_instr, jmp_instr.iter());
|
| 106 |
push_all(&mut split_instr, v2.iter());
|
| 81 | 107 |
(second_pc, split_instr)
|
| 82 | 108 |
},
|
| 83 | 109 |
/* For a repetition, we produce this code:
|
|
| 89 | 115 |
* | IJmp(pc)
|
| 90 | 116 |
* ---- <- expr_pc + 1
|
| 91 | 117 |
*/
|
| 92 | |
Regexp::Rep(box ref expr) =>
|
| 118 |
Regexp::Rep(ref expr) =>
|
| 93 | 119 |
{ let (expr_pc, v1) = emit(expr, pc + 1);
|
| 94 | 120 |
let mut spl = vec![ Instr::Split(pc + 1, expr_pc + 1) ];
|
| 95 | 121 |
let jmp = vec![ Instr::Jmp(pc) ];
|
| 96 | |
spl.push_all(v1.as_slice());
|
| 97 | |
spl.push_all(jmp.as_slice());
|
| 122 |
push_all(&mut spl, v1.iter());
|
| 123 |
push_all(&mut spl, jmp.iter());
|
| 98 | 124 |
(expr_pc + 1, spl)
|
| 99 | 125 |
},
|
| 100 | 126 |
}
|
|
| 102 | 128 |
|
| 103 | 129 |
/* A wrapper over these processes */
|
| 104 | 130 |
pub fn compile(s: &str) -> Vec<Instr> {
|
| 105 | |
let (_, re) = parse(s);
|
| 106 | |
println!("{}", re);
|
| 131 |
let (_, re) = parse(&mut s.chars());
|
| 132 |
println!("{:?}", re);
|
| 107 | 133 |
let (_, mut ins) = emit(&re, 0);
|
| 108 | |
println!("{}", ins);
|
| 134 |
println!("{:?}", ins);
|
| 109 | 135 |
/* If we get to the end of a compiled regular expression,
|
| 110 | 136 |
* that means it hasn't aborted and we can match.
|
| 111 | 137 |
*/
|