Удзельнік:Alexey/канвэртар/LanguageBe tarask.php

Зьвесткі зь Вікіпэдыі — вольнай энцыкляпэдыі
<?php
/** Belarusian language, classic spelling
  * (Беларуская, клясычны)
  *
  * @package MediaWiki
  * @subpackage Language
  */
 
require_once( dirname(__FILE__).'/../LanguageConverter.php' );
require_once( dirname(__FILE__).'/LanguageBe_tarask_cyrl.php' );


// Alphabets of the two writing systems. They are interpolated into PCRE
// character classes below, so they must stay free of regex metacharacters.
// The Cyrillic lists previously contained a duplicated "ЖЗ"/"жз" run; each
// letter is now listed exactly once, in alphabetical order.
define('BE_CYR_UPPERCASE', 'АБВГҐДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ');
define('BE_CYR_LOWERCASE', 'абвгґдеёжзійклмнопрстуўфхцчшыьэюя');
define('BE_LAT_UPPERCASE', 'ABCĆČDEFGHIJKLŁMNŃOPRSŚŠTUŬVYZŹŽ');
define('BE_LAT_LOWERCASE', 'abcćčdefghijklłmnńoprsśštuŭvyzźž');

/**
 * Converter between the Cyrillic ('be-tarask') and Latin ('be-latn')
 * variants of Belarusian classic spelling.
 *
 * The ReplacementArray tables registered in $mTables are empty; the actual
 * transliteration is done word by word with two ordered lists of regular
 * expressions ($cyr2lat and $lat2cyr), see translate() and regsConverter().
 */
class BeTaraskConverter extends LanguageConverter {

	/**
	 * Ordered map "PCRE pattern => replacement", Cyrillic to Latin.
	 * Rules are applied top to bottom to a single word, so their order matters.
	 */
	var $cyr2lat = array();

	/**
	 * Ordered map "PCRE pattern => replacement", Latin to Cyrillic.
	 * Rules are applied top to bottom to a single word, so their order matters.
	 */
	var $lat2cyr = array();

	/**
	 * Fill the regex rule lists and register the (empty) per-variant
	 * ReplacementArray tables in $this->mTables.
	 */
	function loadDefaultTables() {
		$this->cyr2lat = array(
			## ЕЁЮЯ after a vowel
			'/([АЕЁІОУЫЭЮЯ])Е/u' => '$1JE',	'/([аеёіоуыэюя])е/u' => '$1je',
			'/([АЕЁІОУЫЭЮЯ])Ё/u' => '$1JO',	'/([аеёіоуыэюя])ё/u' => '$1jo',
			'/([АЕЁІОУЫЭЮЯ])Ю/u' => '$1JU',	'/([аеёіоуыэюя])ю/u' => '$1ju',
			'/([АЕЁІОУЫЭЮЯ])Я/u' => '$1JA',	'/([аеёіоуыэюя])я/u' => '$1ja',
			## ЕЁЮЯ after an apostrophe or the soft sign
			'/([\'’ь])е/u' => '$1je', '/([\'’ь])ё/u' => '$1jo',
			'/([\'’ь])ю/u' => '$1ju', '/([\'’ь])я/u' => '$1ja',
			'/ЬЕ/u' => 'ЬJE', '/ЬЁ/u' => 'ЬJO', '/ЬЮ/u' => 'ЬJU', '/ЬЯ/u' => 'ЬJA',
			## ЕЁЮЯ at the start of a word // TODO: needs more work
			'/^Е(['.BE_CYR_LOWERCASE.']|$)/u' => 'Je$1',
			'/^Е(['.BE_CYR_UPPERCASE.']|$)/u' => 'JE$1',
			'/^Ё(['.BE_CYR_LOWERCASE.']|$)/u' => 'Jo$1',
			'/^Ё(['.BE_CYR_UPPERCASE.']|$)/u' => 'JO$1',
			'/^Ю(['.BE_CYR_LOWERCASE.']|$)/u' => 'Ju$1',
			'/^Ю(['.BE_CYR_UPPERCASE.']|$)/u' => 'JU$1',
			'/^Я(['.BE_CYR_LOWERCASE.']|$)/u' => 'Ja$1',
			'/^Я(['.BE_CYR_UPPERCASE.']|$)/u' => 'JA$1',
			'/^е/u' => 'je', '/^ё/u' => 'jo', '/^ю/u' => 'ju', '/^я/u' => 'ja',
			'/^Е$/u' => 'Je', '/^Ё$/u' => 'Jo', '/^Ю$/u' => 'Ju', '/^Я$/u' => 'Ja',
			## ЕЁЮЯ after Л
			'/Ля/u' => 'La', '/ля/u' => 'la', '/ЛЯ/u' => 'LA',
			'/Лю/u' => 'Lu', '/ЛЮ/u' => 'LU', '/лю/u' => 'lu',
			'/Лё/u' => 'Lo', '/ЛЁ/u' => 'LO', '/лё/u' => 'lo',
			'/Ле/u' => 'Le', '/ЛЕ/u' => 'LE', '/ле/u' => 'le',
			## ЕЁЮЯ after consonants other than Л
			'/е/u' => 'ie',	'/ё/u' => 'io',	'/ю/u' => 'iu',	'/я/u' => 'ia',
			'/Е/u' => 'IE',	'/Ё/u' => 'IO',	'/Ю/u' => 'IU',	'/Я/u' => 'IA',
			## Л
			'/Л([АОУЫЭаоуыэ])/u' => 'Ł$1', '/Л([Iі])/u' => 'L$1', '/Л[Ьь]/u' => 'L',
			'/л([аоуыэ])/u' => 'ł$1', '/лі/u' => 'li', '/ль/u' => 'l',
			## Soft sign after З, Н, С, Ц
			'/З[Ьь]/u' => 'Ź', '/Н[Ьь]/u' => 'Ń', '/С[Ьь]/u' => 'Ś', '/Ц[Ьь]/u' => 'Ć',
			'/зь/u'    => 'ź', '/нь/u'     => 'ń', '/сь/u'    => 'ś', '/ць/u'    => 'ć',
			## Х followed by a Cyrillic letter, or at the end of the word
			'/Х(['.BE_CYR_UPPERCASE.'])/u' => 'CH$1', '/Х(['.BE_CYR_LOWERCASE.'])/u' => 'Ch$1',
			'/Х$/u' => 'CH',
			## Х followed by something already latinised by the rules above
			## (e.g. "Хлеб" is "Хleб" at this point); without these two rules
			## the capital Х was left in Cyrillic.
			'/Х(['.BE_LAT_UPPERCASE.'])/u' => 'CH$1', '/Х/u' => 'Ch',
			'/х/u' => 'ch',
			## remaining vowels (АІОУЫЭ)
			'/а/u' => 'a', '/А/u' => 'A',	'/і/u' => 'i', '/І/u' => 'I',
			'/о/u' => 'o', '/О/u' => 'O',	'/у/u' => 'u', '/У/u' => 'U',
			'/ы/u' => 'y', '/Ы/u' => 'Y',	'/э/u' => 'e', '/Э/u' => 'E',
			## remaining consonants
			'/Б/u' => 'B', '/б/u' => 'b',	'/В/u' => 'V', '/в/u' => 'v',
			'/Г/u' => 'H', '/г/u' => 'h',	'/Ґ/u' => 'G', '/ґ/u' => 'g',
			'/Д/u' => 'D', '/д/u' => 'd',	'/Ж/u' => 'Ž', '/ж/u' => 'ž',
			'/З/u' => 'Z', '/з/u' => 'z',	'/Й/u' => 'J', '/й/u' => 'j',
			'/К/u' => 'K', '/к/u' => 'k',	'/Л/u' => 'Ł', '/л/u' => 'ł',
			'/М/u' => 'M', '/м/u' => 'm',	'/Н/u' => 'N', '/н/u' => 'n',
			'/П/u' => 'P', '/п/u' => 'p',	'/Р/u' => 'R', '/р/u' => 'r',
			'/С/u' => 'S', '/с/u' => 's',	'/Т/u' => 'T', '/т/u' => 't',
			'/Ў/u' => 'Ŭ', '/ў/u' => 'ŭ',	'/Ф/u' => 'F', '/ф/u' => 'f',
			'/Ц/u' => 'C', '/ц/u' => 'c',	'/Ч/u' => 'Č', '/ч/u' => 'č',
			'/Ш/u' => 'Š', '/ш/u' => 'š',
			## Drop the remaining apostrophes and soft signs
			'/[ьЬ\'’]/u' => '',
		);

		$this->lat2cyr = array(
			## apostrophe between a hard consonant and J + vowel // TODO: verify
			'/([bBdDvVmMpPfFtTrRgGhHkKžŽčČšŠCcNnSsZz])([Jj][eEoOuUaA])/u' => '$1\'$2',
			## ĆŃŚŹ // TODO: needs more work
			## A word consisting of a single capital gets title case. These
			## rules must precede the catch-all '/Ć/u' etc. below, otherwise
			## they can never match.
			'/^Ć$/u' => 'Ць', '/^Ń$/u' => 'Нь', '/^Ś$/u' => 'Сь', '/^Ź$/u' => 'Зь',
			'/Ć(['.BE_LAT_LOWERCASE.'])/u' => 'Ць$1', '/Ć/u' => 'ЦЬ',
			'/Ń(['.BE_LAT_LOWERCASE.'])/u' => 'Нь$1', '/Ń/u' => 'НЬ',
			'/Ś(['.BE_LAT_LOWERCASE.'])/u' => 'Сь$1', '/Ś/u' => 'СЬ',
			'/Ź(['.BE_LAT_LOWERCASE.'])/u' => 'Зь$1', '/Ź/u' => 'ЗЬ',
			'/ć/u' => 'ць', '/ń/u' => 'нь', '/ś/u' => 'сь', '/ź/u' => 'зь',
			## Ł and L // TODO: needs more work
			'/Ł/u' => 'Л', '/La/u' => 'Ля', '/LA/u' => 'ЛЯ',
			'/Le/u' => 'Ле', '/LE/u' => 'ЛЕ', '/Li/u' => 'Лі', '/LI/u' => 'ЛІ',
			'/Lo/u' => 'Лё', '/LO/u' => 'ЛЁ', '/Lu/u' => 'Лю', '/LU/u' => 'ЛЮ',
			'/L(['.BE_LAT_LOWERCASE.'])/u' => 'Ль$1', '/L/u' => 'ЛЬ',
			'/ł/u' => 'л', '/la/u' => 'ля', '/le/u' => 'ле',
			'/li/u' => 'лі', '/lo/u' => 'лё', '/lu/u' => 'лю' , '/l/u' => 'ль',
			## Ch
			'/C[Hh]/u' => 'Х', '/ch/u' => 'х',
			## vowels written as Je Jo Ju Ja / ie io iu ia
			'/[ij]e/u'    => 'е', '/[ij]o/u'    => 'ё',
			'/[ij]u/u'    => 'ю', '/[ij]a/u'    => 'я',
			'/[IJ][Ee]/u' => 'Е', '/[IJ][Oo]/u' => 'Ё',
			'/[IJ][Uu]/u' => 'Ю', '/[IJ][Aa]/u' => 'Я',
			## vowels AEIOUY
			'/A/u' => 'А', '/E/u' => 'Э', '/I/u' => 'І', '/O/u' => 'О',
			'/U/u' => 'У', '/Y/u' => 'Ы', '/a/u' => 'а', '/e/u' => 'э',
			'/i/u' => 'і', '/o/u' => 'о', '/u/u' => 'у', '/y/u' => 'ы',
			## remaining consonants
			'/B/u' => 'Б', '/b/u' => 'б', '/C/u' => 'Ц', '/c/u' => 'ц',
			'/Č/u' => 'Ч', '/č/u' => 'ч', '/D/u' => 'Д', '/d/u' => 'д',
			'/F/u' => 'Ф', '/f/u' => 'ф', '/G/u' => 'Ґ', '/g/u' => 'ґ',
			'/H/u' => 'Г', '/h/u' => 'г', '/J/u' => 'Й', '/j/u' => 'й',
			'/K/u' => 'К', '/k/u' => 'к', '/M/u' => 'М', '/m/u' => 'м',
			'/N/u' => 'Н', '/n/u' => 'н', '/P/u' => 'П', '/p/u' => 'п',
			'/R/u' => 'Р', '/r/u' => 'р', '/S/u' => 'С', '/s/u' => 'с',
			'/Š/u' => 'Ш', '/š/u' => 'ш', '/T/u' => 'Т', '/t/u' => 'т',
			'/Ŭ/u' => 'Ў', '/ŭ/u' => 'ў', '/V/u' => 'В', '/v/u' => 'в',
			'/Z/u' => 'З', '/z/u' => 'з', '/Ž/u' => 'Ж', '/ž/u' => 'ж',
		);

		// No fixed string replacements: both tables are intentionally empty.
		$BeTarask2Cyrl = array();
		$BeTarask2Latn = array();

		$this->mTables = array(
			'be-latn'   => new ReplacementArray( $BeTarask2Latn ),
			'be-tarask' => new ReplacementArray( $BeTarask2Cyrl )
		);
	}

	/**
	 * Transliterate a single word with the regex rule list of the target variant.
	 *
	 * @param string $text      one word (the anchors ^ and $ in the rules
	 *                          refer to the start and end of this string)
	 * @param string $toVariant 'be-tarask' (Latin to Cyrillic) or
	 *                          'be-latn' (Cyrillic to Latin)
	 * @return string converted word; the input is returned unchanged for an
	 *                empty string, an unknown variant, or a PCRE failure
	 */
	function regsConverter( $text, $toVariant ) {
		if ( $text == '' ) {
			return $text;
		}

		if ( $toVariant == 'be-tarask' ) {
			$rules = $this->lat2cyr;
		} elseif ( $toVariant == 'be-latn' ) {
			$rules = $this->cyr2lat;
		} else {
			return $text;
		}

		$converted = $text;
		foreach ( $rules as $pat => $rep ) {
			$converted = preg_replace( $pat, $rep, $converted );
			if ( $converted === null ) {
				// PCRE error (e.g. invalid UTF-8): do not lose the text.
				return $text;
			}
		}
		return $converted;
	}

	/**
	 * Disable content conversion on talk pages, enable it everywhere else,
	 * then delegate to the parent implementation.
	 */
	function parserConvert( $text, &$parser ){
		$title = $parser->getTitle();
		$this->mDoContentConvert = !( is_object( $title ) && $title->isTalkPage() );

		return parent::parserConvert( $text, $parser );
	}

	/**
	 * Wrapper around the parent implementation:
	 *   - links into the user namespaces are never looked up in other variants;
	 *   - if the preferred variant is the main language code, the link
	 *     name is left as it was.
	 */
	function findVariantLink( &$link, &$nt ) {
		// Do not try to find variants for usernames.
		if ( is_object( $nt ) ) {
			$ns = $nt->getNamespace();
			if ( $ns == NS_USER || $ns == NS_USER_TALK ) {
				return;
			}
		}

		$oldlink = $link;
		parent::findVariantLink( $link, $nt );
		if ( $this->getPreferredVariant() == $this->mMainLanguageCode ) {
			$link = $oldlink;
		}
	}

	/**
	 * External link captions should be converted, so the text is returned
	 * as is (instead of being wrapped as -{$text}-), except when it is a URL
	 * or the caller asked for no parsing.
	 */
	function markNoConversion( $text, $noParse = false ) {
		// The scheme list is grouped so that ^ anchors every alternative;
		// previously "ftp://" or "irc://" matched anywhere in the text.
		if ( $noParse || preg_match( '/^(?:https?|ftp|irc):\/\//', $text ) ) {
			return parent::markNoConversion( $text );
		}
		return $text;
	}

	/**
	 * Wrapper that keeps image titles unconverted when viewing a page in
	 * the image namespace, and skips conversion altogether when the
	 * preferred variant is the main language code.
	 */
	function autoConvert( $text, $toVariant = false ) {
		global $wgTitle;
		// $wgTitle is checked first because it may not be set.
		if ( is_object( $wgTitle ) && $wgTitle->getNamespace() == NS_IMAGE ) {
			$imagename = $wgTitle->getNsText();
			// The namespace text is data, not a pattern: quote it.
			if ( preg_match( '/^' . preg_quote( $imagename, '/' ) . ':/', $text ) ) {
				return $text;
			}
		}
		if ( $this->getPreferredVariant() == $this->mMainLanguageCode ) {
			return $text; // NOTE(review): original author marked this line with "???"
		}
		return parent::autoConvert( $text, $toVariant );
	}

	/**
	 * Translate text into the given variant.
	 *
	 * The text is first passed to the parent implementation, then split
	 * into runs of source-script letters; each run is transliterated with
	 * regsConverter() and everything between the runs is copied verbatim.
	 *
	 * @param string $text
	 * @param string $toVariant 'be-latn' or 'be-tarask'; any other value
	 *                          returns $text untouched
	 * @return string
	 */
	function translate( $text, $toVariant ) {
		switch ( $toVariant ) {
		case 'be-latn':
			// Apostrophes belong to the word so that the rules can see them.
			$letters = BE_CYR_UPPERCASE.BE_CYR_LOWERCASE."'’";
			break;

		case 'be-tarask':
			$letters = BE_LAT_UPPERCASE.BE_LAT_LOWERCASE;
			break;

		default:
			return $text;
		}

		$text = parent::translate( $text, $toVariant );

		// Offsets reported by PREG_SPLIT_OFFSET_CAPTURE are byte offsets,
		// which is why the byte-oriented substr()/strlen() are used below.
		$matches = preg_split( '/[^'.$letters.']+/u', $text, -1, PREG_SPLIT_OFFSET_CAPTURE );
		if ( $matches === false ) {
			// PCRE failure (e.g. invalid UTF-8): return the text unsplit.
			return $text;
		}

		$ret = '';
		$mstart = 0;
		foreach ( $matches as $m ) {
			// Separator preceding this word, copied unchanged.
			$ret .= substr( $text, $mstart, $m[1] - $mstart );
			$ret .= $this->regsConverter( $m[0], $toVariant );
			$mstart = $m[1] + strlen( $m[0] );
		}
		return $ret;
	}

}
 
/**
 * Language class for Belarusian classic spelling with script conversion:
 * it extends the Cyrillic language class and attaches a BeTaraskConverter
 * for the 'be-tarask' and 'be-latn' variants.
 */
class LanguageBe_tarask extends LanguageBe_tarask_cyrl {

	/**
	 * Create the converter (each variant falls back to the other one) and
	 * append it to the ArticleSaveComplete hook list.
	 */
	function __construct() {
		global $wgHooks;
		parent::__construct();

		$codes = array( 'be-tarask', 'be-latn' );
		$fallbacks = array(
			'be-tarask' => 'be-latn',
			'be-latn'   => 'be-tarask',
		);

		$this->mConverter = new BeTaraskConverter( $this, 'be-tarask', $codes, $fallbacks );
		$wgHooks['ArticleSaveComplete'][] = $this->mConverter;
	}

	/**
	 * Apply grammar conversion on the 'be-tarask' form of the word.
	 *
	 * Word and case are converted to 'be-tarask' before the parent is
	 * called; if that conversion changed the word, the result is converted
	 * to 'be-latn' afterwards.
	 */
	function convertGrammar( $word, $case ) {
		$profileKey = "LanguageBeTarask::convertGrammar";
		wfProfileIn( $profileKey );

		// Always work on the -tarask form.
		$taraskWord = $this->mConverter->autoConvert( $word, 'be-tarask' );
		$taraskCase = $this->mConverter->autoConvert( $case, 'be-tarask' );
		$result = parent::convertGrammar( $taraskWord, $taraskCase );

		// Restore the original script if the input had to be converted.
		if ( $word != $taraskWord ) {
			$result = $this->mConverter->autoConvert( $result, 'be-latn' );
		}

		wfProfileOut( $profileKey );
		return $result;
	}

	/**
	 * Convert the string to 'be-tarask' and hand it to the parent
	 * stripForSearch().
	 */
	function stripForSearch( $string ) {
		return parent::stripForSearch(
			$this->mConverter->autoConvert( $string, 'be-tarask' )
		);
	}

	/**
	 * Expand search terms to all variants.
	 *
	 * The terms are joined with '|', converted to every variant, joined
	 * again, split on '|' and de-duplicated.
	 */
	function convertForSearchResult( $termsArray ) {
		$joined = implode( '|', $termsArray );
		$perVariant = $this->mConverter->autoConvertToAllVariants( $joined );
		$allTerms = explode( '|', implode( '|', $perVariant ) );
		return array_unique( $allTerms );
	}
}
 
?>