Updated regexp to use Cargo, as well.
Getty Ritter
11 years ago
| 1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 2 | // use std::vec::Vec; | |
| 3 | ||
| 4 | /* A regular expression parse tree, as produced by `parse` and consumed by `emit`. */ | |
| 5 | #[deriving(Show)] | |
| 6 | enum Regexp { | |
| 7 | RChar(char), /* a single literal character */ | |
| 8 | RSeq(Box<Regexp>, Box<Regexp>), /* catenation: the first, then the second (prefix `.`) */ | |
| 9 | RChc(Box<Regexp>, Box<Regexp>), /* choice: the first or the second (prefix `|`) */ | |
| 10 | RRep(Box<Regexp>), /* zero-or-more repetition (prefix `*`) */ | |
| 11 | } | |
| 12 | ||
| 13 | /* Parse one regexp in prefix (Polish) notation from the front of `s`, | |
| 14 | * returning the unconsumed remainder of `s` and the parse tree. Operators: | |
| 15 | * .ab => ab | |
| 16 | * |ab => a|b | |
| 17 | * *a => a* | |
| 18 | * Any other character is a literal. These nest, so (ab|c)* would become | |
| 19 | * *|.abc | |
| 20 | * This is easier to parse. Deal with it. NOTE(review): char_at(0) presumably fails on an empty `s` -- confirm. | |
| 21 | */ | |
| 22 | fn parse<'a>(s: &'a str) -> (&'a str, Regexp) { | |
| 23 | match s.char_at(0) { | |
| 24 | '.' => { let (s1, r1) = parse(s.slice_from(1)); | |
| 25 | let (s2, r2) = parse(s1); | |
| 26 | (s2, RSeq(box r1, box r2)) }, | |
| 27 | '|' => { let (s1, r1) = parse(s.slice_from(1)); | |
| 28 | let (s2, r2) = parse(s1); | |
| 29 | (s2, RChc(box r1, box r2)) }, | |
| 30 | '*' => { let (s1, r1) = parse(s.slice_from(1)); | |
| 31 | (s1, RRep(box r1)) }, | |
| 32 | c => (s.slice_from(1), RChar(c)), | |
| 33 | } | |
| 34 | } | |
| 35 | ||
| 36 | /* Compile a regexp AST to VM instructions. `i` is the absolute index the first emitted instruction will occupy; returns (index just past the emitted code, the code itself). Jump and split targets are absolute indices. */ | |
| 37 | fn emit(r: &Regexp, i: uint) -> (uint, Vec<Instr>) { | |
| 38 | match *r { | |
| 39 | RChar(c) => { (i+1, vec![IChar(c)]) }, | |
| 40 | RSeq(box ref a, box ref b) => /* a's code immediately followed by b's */ | |
| 41 | { let (ai, mut v1) = emit(a, i); | |
| 42 | let (bi, v2) = emit(b, ai); | |
| 43 | v1.push_all_move(v2); | |
| 44 | (bi, v1) }, | |
| 45 | RChc(box ref a, box ref b) => /* layout: split at i, a at i+1..ai, jmp at ai, b at ai+1..bi */ | |
| 46 | { let (ai, v1) = emit(a, i + 1); | |
| 47 | let (bi, v2) = emit(b, ai + 1); | |
| 48 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
| 49 | let jmp = vec![ IJmp(bi) ]; /* once a has matched, skip past b; IJmp(ai) would target the jump itself and never terminate */ | |
| 50 | spl.push_all_move(v1); | |
| 51 | spl.push_all_move(jmp); | |
| 52 | spl.push_all_move(v2); | |
| 53 | (bi, spl) }, | |
| 54 | RRep(box ref a) => /* layout: split at i, a at i+1..ai, jmp at ai back to the split */ | |
| 55 | { let (ai, v1) = emit(a, i + 1); | |
| 56 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
| 57 | let jmp = vec![ IJmp(i) ]; | |
| 58 | spl.push_all_move(v1); | |
| 59 | spl.push_all_move(jmp); | |
| 60 | (ai + 1, spl) }, | |
| 61 | } | |
| 62 | } | |
| 63 | ||
| 64 | /* Compile the prefix-notation regexp `s` to a VM program: parse it, emit instructions starting at index 0, and append the final IMatch. Prints the parse tree and the emitted instructions to stdout along the way; any input left over after the first complete regexp is ignored. */ | |
| 65 | pub fn compile(s: &str) -> Vec<Instr> { | |
| 66 | let (_, re) = parse(s); | |
| 67 | println!("{}", re); | |
| 68 | let (_, ins) = emit(&re, 0); | |
| 69 | println!("{}", ins); | |
| 70 | return ins.append([IMatch]); | |
| 71 | } |
| 1 | /* A single instruction as used in the VM-based matcher; uint operands are absolute indices into the instruction vector. */ | |
| 2 | #[deriving(Clone,Show)] | |
| 3 | pub enum Instr { | |
| 4 | IChar(char), /* match this character at the current position and advance, or fail */ | |
| 5 | IMatch, /* stop and report success; the evaluators do not check for leftover input */ | |
| 6 | IJmp(uint) , /* continue at instr i */ | |
| 7 | ISplit(uint, uint), /* try instr i first, then instr j if that fails */ | |
| 8 | } |
| 1 | pub use re::compile::compile; | |
| 2 | pub use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 3 | pub use re::recursive::eval; | |
| 4 | pub use re::stack::eval; | |
| 5 | pub mod compile; | |
| 6 | pub mod instruction; | |
| 7 | pub mod recursive; | |
| 8 | pub mod stack; |
| 1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 2 | ||
| 3 | /* Report whether the program `instrs` matches `input` from its start, by running eval1 | |
| 4 | * from instruction 0 with nothing consumed. Reaching IMatch is enough; the whole input need not be consumed. */ | |
| 5 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
| 6 | eval1(instrs, input, 0, 0) | |
| 7 | } | |
| 8 | ||
| 9 | /* Run `instrs` from instruction `pc` with `cc` as the current index into `input`, using the Rust call stack for backtracking: ISplit tries its first target and falls back to the second only if that fails. | |
| 10 | * NOTE(review): `cc` advances by 1 per IChar but is passed to char_at/len, which presumably index bytes -- confirm behaviour on non-ASCII input. */ | |
| 11 | fn eval1(instrs: &[Instr], input: &str, pc: uint, cc: uint) -> bool { | |
| 12 | match instrs[pc] { | |
| 13 | IChar(_) if cc >= input.len() => return false, | |
| 14 | IChar(c) if c == input.char_at(cc) => | |
| 15 | eval1(instrs, input, pc + 1, cc + 1), | |
| 16 | IChar(_) => return false, | |
| 17 | IMatch => return true, | |
| 18 | IJmp(i) => eval1(instrs, input, i, cc), | |
| 19 | ISplit(i, _) if eval1(instrs, input, i, cc) => true, | |
| 20 | ISplit(_, j) => eval1(instrs, input, j, cc), | |
| 21 | } | |
| 22 | } |
| 1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 2 | ||
| 3 | /* One pending state of the stack-based matcher: `pc` is the index of the next instruction to run and `cc` the current position in the input. | |
| 4 | * Together these unambiguously specify the state of a program. */ | |
| 5 | struct EvalState { pc: uint, cc: uint } | |
| 6 | ||
| 7 | /* Report whether the program `instrs` matches `input` from its start, backtracking through an explicit stack of EvalState values instead of recursion. Each iteration pops one state and runs a single instruction; a failed IChar simply drops the state. | |
| 8 | * Returns true on reaching IMatch and false once the stack is empty. NOTE(review): `cc` advances by 1 per IChar but is passed to char_at/len, which presumably index bytes -- confirm behaviour on non-ASCII input. */ | |
| 9 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
| 10 | let mut stack = vec![ EvalState {pc: 0, cc: 0} ]; | |
| 11 | ||
| 12 | while stack.len() > 0 { | |
| 13 | let st = stack.pop().unwrap(); | |
| 14 | match instrs[st.pc] { | |
| 15 | IChar(_) if st.cc >= input.len() => | |
| 16 | continue, | |
| 17 | IChar(c) if c == input.char_at(st.cc) => | |
| 18 | stack.push(EvalState { pc: st.pc + 1, cc: st.cc + 1 }), | |
| 19 | IChar(_) => | |
| 20 | continue, | |
| 21 | IMatch => | |
| 22 | return true, | |
| 23 | IJmp(i) => | |
| 24 | stack.push(EvalState { pc: i, cc: st.cc }), | |
| 25 | ISplit(i, j) => { | |
| 26 | stack.push(EvalState { pc: j, cc: st.cc }); /* pushed first, so it is tried only after i's branch fails */ | |
| 27 | stack.push(EvalState { pc: i, cc: st.cc }); /* on top of the stack, so it is tried first */ | |
| 28 | }, | |
| 29 | } | |
| 30 | } | |
| 31 | return false; | |
| 32 | } |
| 1 | DEPS="regexp.rs re/compile.rs re/instruction.rs re/mod.rs re/recursive.rs re/stack.rs" | |
| 2 | redo-ifchange $DEPS | |
| 3 | rustc regexp.rs -o $3 |
| 1 | /* This is a basic implementation of a regular expression matcher, | |
| 2 | * based on Henry Spencer's virtual-machine approach to regular | |
| 3 | * expression matching outlined by Russ Cox here: | |
| 4 | * http://swtch.com/~rsc/regexp/regexp2.html | |
| 5 | * | |
| 6 | * For ease of parsing, I'm using a highly non-standard Polish | |
| 7 | * notation for regular expressions, in which . and | are | |
| 8 | * prefix binary operators for catenation and choice, respectively, | |
| 9 | * and * is a prefix unary operator for repetition. */ | |
| 10 | use re::compile; | |
| 11 | mod re; | |
| 12 | ||
| 13 | fn main() { | |
| 14 | /* Our sample regexp `..a*bc` corresponds to /ab*c/ in the usual notation. | |
| 15 | * It is compiled once and run on the same three inputs by the recursive and then the manual-stack evaluator. */ | |
| 16 | let re = compile("..a*bc"); | |
| 17 | println!("Recursive:"); | |
| 18 | println!(" match(re, \"abbbc\")\t== {}", | |
| 19 | ::re::recursive::eval(re.as_slice(), "abbbc")); | |
| 20 | println!(" match(re, \"ac\")\t== {}", | |
| 21 | ::re::recursive::eval(re.as_slice(), "ac")); | |
| 22 | println!(" match(re, \"abd\")\t== {}", | |
| 23 | ::re::recursive::eval(re.as_slice(), "abd")); | |
| 24 | println!("Manual Stack:"); | |
| 25 | println!(" match(re, \"abbbc\")\t== {}", | |
| 26 | ::re::stack::eval(re.as_slice(), "abbbc")); | |
| 27 | println!(" match(re, \"ac\")\t== {}", | |
| 28 | ::re::stack::eval(re.as_slice(), "ac")); | |
| 29 | println!(" match(re, \"abd\")\t== {}", | |
| 30 | ::re::stack::eval(re.as_slice(), "abd")); | |
| 31 | } |
| 1 | /* This is a basic implementation of a regular expression matcher, | |
| 2 | * based on Henry Spencer's virtual-machine approach to regular | |
| 3 | * expression matching outlined by Russ Cox here: | |
| 4 | * http://swtch.com/~rsc/regexp/regexp2.html | |
| 5 | * | |
| 6 | * For ease of parsing, I'm using a highly non-standard Polish | |
| 7 | * notation for regular expressions, in which . and | are | |
| 8 | * prefix binary operators for catenation and choice, respectively, | |
| 9 | * and * is a prefix unary operator for repetition. */ | |
| 10 | use re::compile; | |
| 11 | mod re; | |
| 12 | ||
| 13 | fn main() { | |
| 14 | /* Our sample regexp `..a*bc` corresponds to /ab*c/ in the usual notation. | |
| 15 | * It is compiled once and run on the same three inputs by the recursive and then the manual-stack evaluator. */ | |
| 16 | let re = compile("..a*bc"); | |
| 17 | println!("Recursive:"); | |
| 18 | println!(" match(re, \"abbbc\")\t== {}", | |
| 19 | ::re::recursive::eval(re.as_slice(), "abbbc")); | |
| 20 | println!(" match(re, \"ac\")\t== {}", | |
| 21 | ::re::recursive::eval(re.as_slice(), "ac")); | |
| 22 | println!(" match(re, \"abd\")\t== {}", | |
| 23 | ::re::recursive::eval(re.as_slice(), "abd")); | |
| 24 | println!("Manual Stack:"); | |
| 25 | println!(" match(re, \"abbbc\")\t== {}", | |
| 26 | ::re::stack::eval(re.as_slice(), "abbbc")); | |
| 27 | println!(" match(re, \"ac\")\t== {}", | |
| 28 | ::re::stack::eval(re.as_slice(), "ac")); | |
| 29 | println!(" match(re, \"abd\")\t== {}", | |
| 30 | ::re::stack::eval(re.as_slice(), "abd")); | |
| 31 | } |
| 1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 2 | // use std::vec::Vec; | |
| 3 | ||
| 4 | /* A regular expression parse tree, as produced by `parse` and consumed by `emit`. */ | |
| 5 | #[deriving(Show)] | |
| 6 | enum Regexp { | |
| 7 | RChar(char), /* a single literal character */ | |
| 8 | RSeq(Box<Regexp>, Box<Regexp>), /* catenation: the first, then the second (prefix `.`) */ | |
| 9 | RChc(Box<Regexp>, Box<Regexp>), /* choice: the first or the second (prefix `|`) */ | |
| 10 | RRep(Box<Regexp>), /* zero-or-more repetition (prefix `*`) */ | |
| 11 | } | |
| 12 | ||
| 13 | /* Parse one regexp in prefix (Polish) notation from the front of `s`, | |
| 14 | * returning the unconsumed remainder of `s` and the parse tree. Operators: | |
| 15 | * .ab => ab | |
| 16 | * |ab => a|b | |
| 17 | * *a => a* | |
| 18 | * Any other character is a literal. These nest, so (ab|c)* would become | |
| 19 | * *|.abc | |
| 20 | * This is easier to parse. Deal with it. NOTE(review): char_at(0) presumably fails on an empty `s` -- confirm. | |
| 21 | */ | |
| 22 | fn parse<'a>(s: &'a str) -> (&'a str, Regexp) { | |
| 23 | match s.char_at(0) { | |
| 24 | '.' => { let (s1, r1) = parse(s.slice_from(1)); | |
| 25 | let (s2, r2) = parse(s1); | |
| 26 | (s2, RSeq(box r1, box r2)) }, | |
| 27 | '|' => { let (s1, r1) = parse(s.slice_from(1)); | |
| 28 | let (s2, r2) = parse(s1); | |
| 29 | (s2, RChc(box r1, box r2)) }, | |
| 30 | '*' => { let (s1, r1) = parse(s.slice_from(1)); | |
| 31 | (s1, RRep(box r1)) }, | |
| 32 | c => (s.slice_from(1), RChar(c)), | |
| 33 | } | |
| 34 | } | |
| 35 | ||
| 36 | /* Compile a regexp AST to VM instructions. `i` is the absolute index the first emitted instruction will occupy; returns (index just past the emitted code, the code itself). Jump and split targets are absolute indices. */ | |
| 37 | fn emit(r: &Regexp, i: uint) -> (uint, Vec<Instr>) { | |
| 38 | match *r { | |
| 39 | RChar(c) => { (i+1, vec![IChar(c)]) }, | |
| 40 | RSeq(box ref a, box ref b) => /* a's code immediately followed by b's */ | |
| 41 | { let (ai, mut v1) = emit(a, i); | |
| 42 | let (bi, v2) = emit(b, ai); | |
| 43 | v1.push_all_move(v2); | |
| 44 | (bi, v1) }, | |
| 45 | RChc(box ref a, box ref b) => /* layout: split at i, a at i+1..ai, jmp at ai, b at ai+1..bi */ | |
| 46 | { let (ai, v1) = emit(a, i + 1); | |
| 47 | let (bi, v2) = emit(b, ai + 1); | |
| 48 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
| 49 | let jmp = vec![ IJmp(bi) ]; /* once a has matched, skip past b; IJmp(ai) would target the jump itself and never terminate */ | |
| 50 | spl.push_all_move(v1); | |
| 51 | spl.push_all_move(jmp); | |
| 52 | spl.push_all_move(v2); | |
| 53 | (bi, spl) }, | |
| 54 | RRep(box ref a) => /* layout: split at i, a at i+1..ai, jmp at ai back to the split */ | |
| 55 | { let (ai, v1) = emit(a, i + 1); | |
| 56 | let mut spl = vec![ ISplit(i + 1, ai + 1) ]; | |
| 57 | let jmp = vec![ IJmp(i) ]; | |
| 58 | spl.push_all_move(v1); | |
| 59 | spl.push_all_move(jmp); | |
| 60 | (ai + 1, spl) }, | |
| 61 | } | |
| 62 | } | |
| 63 | ||
| 64 | /* Compile the prefix-notation regexp `s` to a VM program: parse it, emit instructions starting at index 0, and append the final IMatch. Prints the parse tree and the emitted instructions to stdout along the way; any input left over after the first complete regexp is ignored. */ | |
| 65 | pub fn compile(s: &str) -> Vec<Instr> { | |
| 66 | let (_, re) = parse(s); | |
| 67 | println!("{}", re); | |
| 68 | let (_, ins) = emit(&re, 0); | |
| 69 | println!("{}", ins); | |
| 70 | return ins.append([IMatch]); | |
| 71 | } |
| 1 | /* A single instruction as used in the VM-based matcher; uint operands are absolute indices into the instruction vector. */ | |
| 2 | #[deriving(Clone,Show)] | |
| 3 | pub enum Instr { | |
| 4 | IChar(char), /* match this character at the current position and advance, or fail */ | |
| 5 | IMatch, /* stop and report success; the evaluators do not check for leftover input */ | |
| 6 | IJmp(uint) , /* continue at instr i */ | |
| 7 | ISplit(uint, uint), /* try instr i first, then instr j if that fails */ | |
| 8 | } |
| 1 | pub use re::compile::compile; | |
| 2 | pub use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 3 | pub use re::recursive::eval; | |
| 4 | pub use re::stack::eval; | |
| 5 | pub mod compile; | |
| 6 | pub mod instruction; | |
| 7 | pub mod recursive; | |
| 8 | pub mod stack; |
| 1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 2 | ||
| 3 | /* Report whether the program `instrs` matches `input` from its start, by running eval1 | |
| 4 | * from instruction 0 with nothing consumed. Reaching IMatch is enough; the whole input need not be consumed. */ | |
| 5 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
| 6 | eval1(instrs, input, 0, 0) | |
| 7 | } | |
| 8 | ||
| 9 | /* Run `instrs` from instruction `pc` with `cc` as the current index into `input`, using the Rust call stack for backtracking: ISplit tries its first target and falls back to the second only if that fails. | |
| 10 | * NOTE(review): `cc` advances by 1 per IChar but is passed to char_at/len, which presumably index bytes -- confirm behaviour on non-ASCII input. */ | |
| 11 | fn eval1(instrs: &[Instr], input: &str, pc: uint, cc: uint) -> bool { | |
| 12 | match instrs[pc] { | |
| 13 | IChar(_) if cc >= input.len() => return false, | |
| 14 | IChar(c) if c == input.char_at(cc) => | |
| 15 | eval1(instrs, input, pc + 1, cc + 1), | |
| 16 | IChar(_) => return false, | |
| 17 | IMatch => return true, | |
| 18 | IJmp(i) => eval1(instrs, input, i, cc), | |
| 19 | ISplit(i, _) if eval1(instrs, input, i, cc) => true, | |
| 20 | ISplit(_, j) => eval1(instrs, input, j, cc), | |
| 21 | } | |
| 22 | } |
| 1 | use re::instruction::{Instr,IChar,IMatch,IJmp,ISplit}; | |
| 2 | ||
| 3 | /* One pending state of the stack-based matcher: `pc` is the index of the next instruction to run and `cc` the current position in the input. | |
| 4 | * Together these unambiguously specify the state of a program. */ | |
| 5 | struct EvalState { pc: uint, cc: uint } | |
| 6 | ||
| 7 | /* Report whether the program `instrs` matches `input` from its start, backtracking through an explicit stack of EvalState values instead of recursion. Each iteration pops one state and runs a single instruction; a failed IChar simply drops the state. | |
| 8 | * Returns true on reaching IMatch and false once the stack is empty. NOTE(review): `cc` advances by 1 per IChar but is passed to char_at/len, which presumably index bytes -- confirm behaviour on non-ASCII input. */ | |
| 9 | pub fn eval(instrs: &[Instr], input: &str) -> bool { | |
| 10 | let mut stack = vec![ EvalState {pc: 0, cc: 0} ]; | |
| 11 | ||
| 12 | while stack.len() > 0 { | |
| 13 | let st = stack.pop().unwrap(); | |
| 14 | match instrs[st.pc] { | |
| 15 | IChar(_) if st.cc >= input.len() => | |
| 16 | continue, | |
| 17 | IChar(c) if c == input.char_at(st.cc) => | |
| 18 | stack.push(EvalState { pc: st.pc + 1, cc: st.cc + 1 }), | |
| 19 | IChar(_) => | |
| 20 | continue, | |
| 21 | IMatch => | |
| 22 | return true, | |
| 23 | IJmp(i) => | |
| 24 | stack.push(EvalState { pc: i, cc: st.cc }), | |
| 25 | ISplit(i, j) => { | |
| 26 | stack.push(EvalState { pc: j, cc: st.cc }); /* pushed first, so it is tried only after i's branch fails */ | |
| 27 | stack.push(EvalState { pc: i, cc: st.cc }); /* on top of the stack, so it is tried first */ | |
| 28 | }, | |
| 29 | } | |
| 30 | } | |
| 31 | return false; | |
| 32 | } |