源
发布于

汉字转拼音(多音字处理)——Java

汉字转拼音需要引入 pinyin4j 依赖包,示例代码如下:

package net.pushi.neo4jdata.util;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utilities for converting Chinese text to Hanyu Pinyin using pinyin4j.
 *
 * <p>Two families of conversions are offered:
 * <ul>
 *   <li>{@link #convertChineseToPinyin(String)} returns a single pinyin string and resolves
 *       polyphonic characters with a user supplied dictionary file (see
 *       {@link #initPinyin(String)});</li>
 *   <li>{@link #getPinyin(String)} and its variants return every combination of readings,
 *       joined by commas, and do not use the dictionary.</li>
 * </ul>
 *
 * <p>The dictionary is kept in static, unsynchronized state, so loading it concurrently with
 * conversions is not safe.
 *
 * @author hsy
 * @since 2020/12/7
 */
public class PinyinUtil {

    private static final Logger LOGGER = LoggerFactory.getLogger(PinyinUtil.class);

    /** Characters removed by {@link #strFilter(String)}; compiled once instead of per call. */
    private static final Pattern SPECIAL_CHARACTERS =
            Pattern.compile("[`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?\"]");

    /**
     * Context windows tried, in priority order, when resolving a polyphonic character at
     * index {@code i}: each entry is {startOffset, endOffset} so the candidate word is
     * {@code text.substring(i + startOffset, i + endOffset)}.
     */
    private static final int[][] CONTEXT_WINDOWS = {
        {0, 3},    // the character plus the two following characters
        {0, 2},    // the character plus the one following character
        {-2, 1},   // the two preceding characters plus the character
        {-1, 1},   // the one preceding character plus the character
        {-1, 2},   // one preceding, the character, one following
    };

    /** Polyphone dictionary: pinyin reading -> words (or single characters) that use it. */
    private static final Map<String, List<String>> pinyinMap = new HashMap<String, List<String>>();

    /** Dictionary paths that have already been passed to {@link #initPinyin(String)}. */
    private static final Set<String> attemptedDictionaries = new HashSet<String>();

    /**
     * Upper-cases the first character of a string and leaves the rest untouched.
     *
     * @param str the string to convert, may be {@code null}
     * @return {@code null} for {@code null} input, otherwise the string with its first
     *         character upper-cased
     */
    public static String convertInitialToUpperCase(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale (e.g. Turkish "i").
        return str.substring(0, 1).toUpperCase(Locale.ROOT) + str.substring(1);
    }

    /**
     * Converts Chinese text to pinyin, capitalizing each syllable and resolving polyphonic
     * characters through the dictionary {@code duoyinzi_dic.txt} in the working directory.
     *
     * <p>The default dictionary is loaded on the first call only; call
     * {@link #initPinyin(String)} explicitly to reload it or to load another file.
     * Characters that have no pinyin (ASCII, punctuation, other scripts) are copied to the
     * output unchanged. A polyphonic character with no matching dictionary entry falls back
     * to the first reading reported by pinyin4j.
     *
     * @param chinese the text to convert, may be {@code null}
     * @return the pinyin string, or an empty string for {@code null} input
     */
    public static String convertChineseToPinyin(String chinese) {
        if (chinese == null) {
            return "";
        }
        ensureDefaultDictionaryLoaded();

        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        StringBuilder pinyin = new StringBuilder();
        for (int i = 0; i < chinese.length(); i++) {
            char ch = chinese.charAt(i);
            if (ch <= 128) {
                // ASCII range: nothing to convert.
                pinyin.append(ch);
                continue;
            }

            String[] readings = null;
            try {
                readings = PinyinHelper.toHanyuPinyinStringArray(ch, format);
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                LOGGER.error("Invalid pinyin output format while converting '" + ch + "'", e);
            }
            if (readings == null || readings.length == 0) {
                // Not a Chinese character known to pinyin4j: keep it as-is, like ASCII input.
                pinyin.append(ch);
                continue;
            }

            // pinyin4j writes the u-umlaut as "u:"; this utility emits "v" instead.
            String[] normalized = new String[readings.length];
            for (int x = 0; x < readings.length; x++) {
                normalized[x] = readings[x].replace("u:", "v");
            }

            String chosen;
            if (normalized.length == 1 || normalized[0].equals(normalized[1])) {
                // Single reading, or readings that only differed by the (removed) tone.
                chosen = normalized[0];
            } else {
                chosen = resolvePolyphone(chinese, i, normalized);
            }
            pinyin.append(convertInitialToUpperCase(chosen));
        }
        return pinyin.toString();
    }

    /**
     * Picks the reading of the polyphonic character at {@code index} using the dictionary.
     *
     * <p>For each reading, in order, the surrounding words described by
     * {@link #CONTEXT_WINDOWS} are looked up; the first reading whose word list contains one
     * of them wins. If none matches, the reading whose word list contains the character on
     * its own (the configured default) is used, and finally the first reading.
     */
    private static String resolvePolyphone(String text, int index, String[] readings) {
        int length = text.length();
        for (int x = 0; x < readings.length; x++) {
            List<String> words = pinyinMap.get(readings[x]);
            if (words == null) {
                continue;
            }
            for (int[] window : CONTEXT_WINDOWS) {
                int start = index + window[0];
                int end = index + window[1];
                if (start >= 0 && end <= length && words.contains(text.substring(start, end))) {
                    return readings[x];
                }
            }
        }

        String single = String.valueOf(text.charAt(index));
        for (int x = 0; x < readings.length; x++) {
            List<String> words = pinyinMap.get(readings[x]);
            if (words != null && words.contains(single)) {
                return readings[x];
            }
        }
        return readings[0];
    }

    /** Returns the location of the default dictionary inside the working directory. */
    private static String defaultDictionaryPath() {
        return System.getProperty("user.dir") + "/duoyinzi_dic.txt";
    }

    /** Loads the default dictionary unless a load of that path was already attempted. */
    private static void ensureDefaultDictionaryLoaded() {
        String path = defaultDictionaryPath();
        if (!attemptedDictionaries.contains(path)) {
            initPinyin(path);
        }
    }

    /**
     * Loads a polyphone dictionary file into the shared dictionary.
     *
     * <p>Each line is expected to look like {@code pinyin#word1 word2 ...}: the reading, a
     * {@code #}, then the space separated words that use this reading. Lines without a
     * {@code #} are skipped. The file is read as UTF-8. Entries are added to (and may
     * overwrite) those already loaded. A missing or unreadable file is logged and otherwise
     * ignored.
     *
     * @param fileName path of the dictionary file; {@code null} is ignored
     */
    public static void initPinyin(String fileName) {
        if (fileName == null) {
            LOGGER.warn("No polyphone dictionary path given; nothing loaded");
            return;
        }
        attemptedDictionaries.add(fileName);

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Drop a UTF-8 byte order mark so it does not become part of the first key.
                    if (line.length() > 0 && line.charAt(0) == 0xFEFF) {
                        line = line.substring(1);
                    }
                }
                String[] parts = line.split("#");
                if (parts.length < 2) {
                    continue;   // blank or malformed line
                }
                pinyinMap.put(parts[0], Arrays.asList(parts[1].split(" ")));
            }
        } catch (IOException e) {
            LOGGER.error("Failed to load polyphone dictionary " + fileName, e);
        } finally {
            // reader stays null when the file could not be opened.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOGGER.warn("Failed to close polyphone dictionary " + fileName, e);
                }
            }
        }
    }

    /**
     * Converts text to pinyin with every syllable capitalized. When characters have several
     * readings, every combination is returned, separated by commas.
     *
     * @param paramString the text to convert, may be {@code null}
     * @return the comma separated combinations, or an empty string for {@code null} or
     *         blank input
     */
    public static String getPinyin(String paramString) {
        return getPinyinZh_CN(convertStringByChinese(paramString));
    }

    /**
     * Same as {@link #getPinyin(String)} but entirely in upper case.
     *
     * @param paramString the text to convert, may be {@code null}
     * @return the upper-cased, comma separated combinations
     */
    public static String getPinyinToUpperCase(String paramString) {
        return getPinyin(paramString).toUpperCase(Locale.ROOT);
    }

    /**
     * Same as {@link #getPinyin(String)} but entirely in lower case.
     *
     * @param paramString the text to convert, may be {@code null}
     * @return the lower-cased, comma separated combinations
     */
    public static String getPinyinToLowerCase(String paramString) {
        return getPinyin(paramString).toLowerCase(Locale.ROOT);
    }

    /**
     * Converts text to pinyin with the first letter of every syllable capitalized.
     * Equivalent to {@link #getPinyin(String)}.
     *
     * @param paramString the text to convert, may be {@code null}
     * @return the comma separated combinations
     */
    public static String getPinyinFirstToUpperCase(String paramString) {
        return getPinyin(paramString);
    }

    /** Builds the lower-case, toneless, "u:" style format used by the getPinyin family. */
    private static HanyuPinyinOutputFormat getDefaultFormat() {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_U_AND_COLON);
        return format;
    }

    /** Tells whether {@code c} is in the range U+4E00..U+9FA5 or is U+3007. */
    private static boolean isChineseCharacter(char c) {
        return (c >= 0x4E00 && c <= 0x9FA5) || c == 0x3007;
    }

    /**
     * Expands a string into the set of all its pinyin combinations.
     *
     * @return the combinations; empty (never {@code null}) for {@code null} or blank input
     */
    private static Set<String> convertStringByChinese(String paramString) {
        Set<String> combinations = new HashSet<String>();
        if (paramString == null || paramString.trim().isEmpty()) {
            return combinations;
        }

        HanyuPinyinOutputFormat format = getDefaultFormat();
        String[][] readings = new String[paramString.length()][];
        for (int i = 0; i < paramString.length(); i++) {
            char c = paramString.charAt(i);
            String[] candidates = null;
            if (isChineseCharacter(c)) {
                try {
                    candidates = PinyinHelper.toHanyuPinyinStringArray(c, format);
                } catch (BadHanyuPinyinOutputFormatCombination e) {
                    LOGGER.error("Invalid pinyin output format while converting '" + c + "'", e);
                }
            }
            if (candidates == null || candidates.length == 0) {
                // No pinyin available: the character stands for itself.
                candidates = new String[] { String.valueOf(c) };
            }
            readings[i] = candidates;
        }

        combinations.addAll(Arrays.asList(exchange(readings)));
        return combinations;
    }

    /**
     * Builds every concatenation that takes one (capitalized) entry from each row, in row
     * order. Duplicates are removed after each row so that characters whose readings differ
     * only by tone do not multiply the intermediate results.
     */
    private static String[] exchange(String[][] paramArrayOfString) {
        Set<String> prefixes = new LinkedHashSet<String>();
        prefixes.add("");
        for (String[] options : paramArrayOfString) {
            Set<String> extended = new LinkedHashSet<String>();
            for (String prefix : prefixes) {
                for (String option : options) {
                    extended.add(prefix + capitalize(option));
                }
            }
            prefixes = extended;
        }
        return prefixes.toArray(new String[prefixes.size()]);
    }

    /** Upper-cases the first character when it is an ASCII lower-case letter. */
    private static String capitalize(String paramString) {
        if (paramString.isEmpty()) {
            return paramString;
        }
        char first = paramString.charAt(0);
        if (first < 'a' || first > 'z') {
            return paramString;
        }
        return (char) (first - ('a' - 'A')) + paramString.substring(1);
    }

    /** Joins the entries of the set with commas; {@code null} yields an empty string. */
    private static String getPinyinZh_CN(Set<String> paramSet) {
        if (paramSet == null) {
            return "";
        }
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (String entry : paramSet) {
            if (!first) {
                joined.append(',');
            }
            joined.append(entry);
            first = false;
        }
        return joined.toString();
    }

    /**
     * Returns the first letter of the pinyin of every character; characters without pinyin
     * are copied unchanged. For polyphonic characters the first reading is used.
     *
     * @param paramString the text to convert, may be {@code null}
     * @return the initials, or an empty string for {@code null} or blank input
     */
    public static String getPinYinHeadChar(String paramString) {
        if (paramString == null || paramString.trim().isEmpty()) {
            return "";
        }
        StringBuilder initials = new StringBuilder();
        for (int i = 0; i < paramString.length(); i++) {
            char c = paramString.charAt(i);
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(c);
            if (readings != null && readings.length > 0 && !readings[0].isEmpty()) {
                initials.append(readings[0].charAt(0));
            } else {
                initials.append(c);
            }
        }
        return initials.toString();
    }

    /**
     * Removes the ASCII and full-width punctuation listed in the filter pattern, then trims
     * the result.
     *
     * @param paramString the text to clean, may be {@code null}
     * @return the cleaned text, or an empty string for {@code null} input
     */
    public static String strFilter(String paramString) throws PatternSyntaxException {
        if (paramString == null) {
            return "";
        }
        return SPECIAL_CHARACTERS.matcher(paramString).replaceAll("").trim();
    }

    /**
     * Returns the pinyin initials of the text with special characters filtered out.
     *
     * @param paramString the text to convert, may be {@code null}
     * @return the filtered initials
     */
    public static String getPinYinHeadCharFilter(String paramString) {
        return strFilter(getPinYinHeadChar(paramString));
    }

    /** Small manual demo of the conversions. */
    public static void main(String[] paramArrayOfString) {
        String path = defaultDictionaryPath();
        System.out.println("路径=====》"+path);
        initPinyin(path);
        String pinyin = convertChineseToPinyin("捋捋关系");
        System.out.println("pinyin=====>"+pinyin);

        String str = "捋捋关系";
        LOGGER.info("小写输出:" + getPinyinToLowerCase(str));
        LOGGER.info("大写输出:" + getPinyinToUpperCase(str));
        LOGGER.info("首字母大写输出:" + getPinyinFirstToUpperCase(str));
        LOGGER.info("返回中文的首字母输出:" + getPinYinHeadChar(str));
        LOGGER.info("返回中文的首字母并过滤特殊字符输出:" + getPinYinHeadCharFilter(str));
    }
}

评论