diff --git a/README.md b/README.md index 38a1a8a..4dc9f7b 100644 --- a/README.md +++ b/README.md @@ -47,11 +47,11 @@ will output: Fédération Camerounaise de Football Fédération Camerounaise de Football -Options: +Flags: ======== By default, `Encoding::fixUTF8` will use the `Encoding::WITHOUT_ICONV` flag, signalling that iconv should not be used to fix garbled UTF8 strings. -This class also provides options for iconv processing, such as `Encoding::ICONV_TRANSLIT` and `Encoding::ICONV_IGNORE` to enable these flags when the iconv class is utilized. The functionality of such flags are documented in the [PHP iconv documentation](http://php.net/manual/en/function.iconv.php). +This class also provides flags for iconv processing, such as `Encoding::ICONV_TRANSLIT` and `Encoding::ICONV_IGNORE` to enable these flags when the iconv class is utilized. The functionality of such flags is documented in the [PHP iconv documentation](http://php.net/manual/en/function.iconv.php). Examples: @@ -61,12 +61,14 @@ Examples: echo Encoding::fixUTF8($str); // Will break U+2014 echo Encoding::fixUTF8($str, Encoding::ICONV_IGNORE); // Will preserve U+2014 echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT); // Will preserve U+2014 + echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT | Encoding::ICONV_IGNORE); // Will preserve U+2014 will output: Fédération Camerounaise?de?Football Fédération Camerounaise—de—Football Fédération Camerounaise—de—Football + Fédération Camerounaise—de—Football while: @@ -76,12 +78,14 @@ while: echo Encoding::fixUTF8($str); // Will break invalid characters echo Encoding::fixUTF8($str, Encoding::ICONV_IGNORE); // Will remove invalid characters, keep those present in Win1252 echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT); // Will trasliterate invalid characters, keep those present in Win1252 + echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT | Encoding::ICONV_IGNORE); // Will transliterate invalid (but legal) characters, remove illegal characters in 
input string, keep those present in Win1252 will output: ???????? šž ceeišuuž + ceeišuuž Install via composer: diff --git a/src/ForceUTF8/Encoding.php b/src/ForceUTF8/Encoding.php index 2031592..55326bf 100644 --- a/src/ForceUTF8/Encoding.php +++ b/src/ForceUTF8/Encoding.php @@ -41,9 +41,9 @@ class Encoding { - const ICONV_TRANSLIT = "TRANSLIT"; - const ICONV_IGNORE = "IGNORE"; - const WITHOUT_ICONV = ""; + const ICONV_TRANSLIT = 1; + const ICONV_IGNORE = 2; + const WITHOUT_ICONV = 4; protected static $win1252ToUtf8 = array( 128 => "\xe2\x82\xac", @@ -199,7 +199,7 @@ static function toUTF8($text){ $c3 = $i+2 >= $max? "\x00" : $text[$i+2]; $c4 = $i+3 >= $max? "\x00" : $text[$i+3]; if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already + if($c2 >= "\x80" && $c2 <= "\xbf"){ //continuation byte (0x80-0xBF): almost sure it's UTF8 already $buf .= $c1 . $c2; $i++; } else { //not valid UTF8. Convert it. @@ -337,14 +337,24 @@ public static function encode($encodingLabel, $text) return self::toUTF8($text); } - protected static function utf8_decode($text, $option = self::WITHOUT_ICONV) + protected static function utf8_decode($text, $flags = self::WITHOUT_ICONV) { - if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) { + if ($flags & self::WITHOUT_ICONV || !function_exists('iconv')) { $o = utf8_decode( str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)) ); } else { - $o = iconv("UTF-8", "Windows-1252" . ($option === self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option === self::ICONV_IGNORE ? '//IGNORE' : '')), $text); + $outCharsetParams = ''; + + if ($flags & self::ICONV_TRANSLIT) { + $outCharsetParams .= '//TRANSLIT'; + } + + if ($flags & self::ICONV_IGNORE) { + $outCharsetParams .= '//IGNORE'; + } + + $o = iconv("UTF-8", "Windows-1252" . 
$outCharsetParams, $text); } return $o; } diff --git a/test/ForceUTF8Test.php b/test/ForceUTF8Test.php index 02ec687..b5639b9 100644 --- a/test/ForceUTF8Test.php +++ b/test/ForceUTF8Test.php @@ -97,5 +97,8 @@ function test_double_encoded_arrays_fix(){ Test::identical("fixUTF8() Example 4 still working.", Encoding::fixUTF8("Fédération Camerounaise de Football\n"), "Fédération Camerounaise de Football\n"); +Test::identical("fixUTF8() Example 5 still working.", + Encoding::fixUTF8("À \n"), + "À \n"); Test::totals();