From 1065ea56092fe376c52a7625eb0d49bc333c5f21 Mon Sep 17 00:00:00 2001 From: GabrieleNunez Date: Wed, 5 Aug 2015 15:28:19 -0400 Subject: [PATCH] - Moved Regular Expression generation into function generateCensorChecks and storing in variable $this->censorChecks; - At the beginning of function censorString now doing a check to see if $this->censorChecks is generated. More performant in iteration heavy scenarios --- src/CensorWords.php | 102 ++++++++++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 42 deletions(-) diff --git a/src/CensorWords.php b/src/CensorWords.php index 9d5f190..819aa99 100644 --- a/src/CensorWords.php +++ b/src/CensorWords.php @@ -4,6 +4,12 @@ class CensorWords { + /* + * When the dictionary is loaded, a ton of regular expression strings are generated + * These regular expressions are used to perform the profanity checks. + * Store them here so when we call censorString we don't need to regenerate them on every call + */ + private $censorChecks = null; public function __construct() { $this->replacer = '*'; @@ -45,7 +51,6 @@ public function setDictionary($dictionary) { } } $this->badwords = $badwords; - } @@ -72,7 +77,55 @@ public function randCensor($chars, $len) { substr($chars, 0, ($len%strlen($chars)))); } + + /** + * Generates the regular expressions that are going to be used to check for profanity + * @param boolean $fullWords Option to generate regular expressions used for full words instead. 
Default is false + * void + */ + private function generateCensorChecks($fullWords = false) { + + $badwords = $this->badwords; + + // generate censor checks as soon as we load the dictionary + // utilize leet equivalents as well + $leet_replace = array(); + $leet_replace['a']= '(a|a\.|a\-|4|@|Á|á|À|Â|à|Â|â|Ä|ä|Ã|ã|Å|å|α|Δ|Λ|λ)'; + $leet_replace['b']= '(b|b\.|b\-|8|\|3|ß|Β|β)'; + $leet_replace['c']= '(c|c\.|c\-|Ç|ç|¢|€|<|\(|{|©)'; + $leet_replace['d']= '(d|d\.|d\-|∂|\|\)|Þ|þ|Ð|ð)'; + $leet_replace['e']= '(e|e\.|e\-|3|€|È|è|É|é|Ê|ê|∑)'; + $leet_replace['f']= '(f|f\.|f\-|ƒ)'; + $leet_replace['g']= '(g|g\.|g\-|6|9)'; + $leet_replace['h']= '(h|h\.|h\-|Η)'; + $leet_replace['i']= '(i|i\.|i\-|!|\||\]\[|]|1|∫|Ì|Í|Î|Ï|ì|í|î|ï)'; + $leet_replace['j']= '(j|j\.|j\-)'; + $leet_replace['k']= '(k|k\.|k\-|Κ|κ)'; + $leet_replace['l']= '(l|1\.|l\-|!|\||\]\[|]|£|∫|Ì|Í|Î|Ï)'; + $leet_replace['m']= '(m|m\.|m\-)'; + $leet_replace['n']= '(n|n\.|n\-|η|Ν|Π)'; + $leet_replace['o']= '(o|o\.|o\-|0|Ο|ο|Φ|¤|°|ø)'; + $leet_replace['p']= '(p|p\.|p\-|ρ|Ρ|¶|þ)'; + $leet_replace['q']= '(q|q\.|q\-)'; + $leet_replace['r']= '(r|r\.|r\-|®)'; + $leet_replace['s']= '(s|s\.|s\-|5|\$|§)'; + $leet_replace['t']= '(t|t\.|t\-|Τ|τ)'; + $leet_replace['u']= '(u|u\.|u\-|υ|µ)'; + $leet_replace['v']= '(v|v\.|v\-|υ|ν)'; + $leet_replace['w']= '(w|w\.|w\-|ω|ψ|Ψ)'; + $leet_replace['x']= '(x|x\.|x\-|Χ|χ)'; + $leet_replace['y']= '(y|y\.|y\-|¥|γ|ÿ|ý|Ÿ|Ý)'; + $leet_replace['z']= '(z|z\.|z\-|Ζ)'; + $censorChecks = array(); + for ($x=0; $xcensorChecks = $censorChecks; + + } /** * Apply censorship to $string, replacing $badwords with $censorChar. 
@@ -81,53 +134,18 @@ public function randCensor($chars, $len) { * string[string] */ public function censorString($string, $fullWords = false) { - $badwords = $this->badwords; - $anThis = &$this; - $leet_replace = array(); - $leet_replace['a']= '(a|a\.|a\-|4|@|Á|á|À|Â|à|Â|â|Ä|ä|Ã|ã|Å|å|α|Δ|Λ|λ)'; - $leet_replace['b']= '(b|b\.|b\-|8|\|3|ß|Β|β)'; - $leet_replace['c']= '(c|c\.|c\-|Ç|ç|¢|€|<|\(|{|©)'; - $leet_replace['d']= '(d|d\.|d\-|∂|\|\)|Þ|þ|Ð|ð)'; - $leet_replace['e']= '(e|e\.|e\-|3|€|È|è|É|é|Ê|ê|∑)'; - $leet_replace['f']= '(f|f\.|f\-|ƒ)'; - $leet_replace['g']= '(g|g\.|g\-|6|9)'; - $leet_replace['h']= '(h|h\.|h\-|Η)'; - $leet_replace['i']= '(i|i\.|i\-|!|\||\]\[|]|1|∫|Ì|Í|Î|Ï|ì|í|î|ï)'; - $leet_replace['j']= '(j|j\.|j\-)'; - $leet_replace['k']= '(k|k\.|k\-|Κ|κ)'; - $leet_replace['l']= '(l|1\.|l\-|!|\||\]\[|]|£|∫|Ì|Í|Î|Ï)'; - $leet_replace['m']= '(m|m\.|m\-)'; - $leet_replace['n']= '(n|n\.|n\-|η|Ν|Π)'; - $leet_replace['o']= '(o|o\.|o\-|0|Ο|ο|Φ|¤|°|ø)'; - $leet_replace['p']= '(p|p\.|p\-|ρ|Ρ|¶|þ)'; - $leet_replace['q']= '(q|q\.|q\-)'; - $leet_replace['r']= '(r|r\.|r\-|®)'; - $leet_replace['s']= '(s|s\.|s\-|5|\$|§)'; - $leet_replace['t']= '(t|t\.|t\-|Τ|τ)'; - $leet_replace['u']= '(u|u\.|u\-|υ|µ)'; - $leet_replace['v']= '(v|v\.|v\-|υ|ν)'; - $leet_replace['w']= '(w|w\.|w\-|ω|ψ|Ψ)'; - $leet_replace['x']= '(x|x\.|x\-|Χ|χ)'; - $leet_replace['y']= '(y|y\.|y\-|¥|γ|ÿ|ý|Ÿ|Ý)'; - $leet_replace['z']= '(z|z\.|z\-|Ζ)'; - - $words = explode(" ", $string); - - for ($x=0; $xcensorChecks) + $this->generateCensorChecks($fullWords); + + $anThis = &$this; $counter=0; $match = array(); $newstring = array(); $newstring['orig'] = html_entity_decode($string); // $anThis for <= PHP5.3 - $newstring['clean'] = preg_replace_callback($badwords, function($matches) use (&$anThis,&$counter,&$match) { + $newstring['clean'] = preg_replace_callback($this->censorChecks, function($matches) use (&$anThis,&$counter,&$match) { $match[$counter++] = $matches[0]; // is $anThis->replacer a single char?