汉字转拼音(多音字处理)——Java
汉字转拼音需要引入 pinyin4j 包(示例中的日志输出还依赖 slf4j)。示例:
package net.pushi.neo4jdata.util;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Utility methods for converting Chinese text to Hanyu Pinyin using pinyin4j.
 *
 * <p>Two families of conversions are offered:
 * <ul>
 *   <li>{@link #convertChineseToPinyin(String)} picks a single reading per character and
 *       resolves polyphones with a user-supplied dictionary file;</li>
 *   <li>{@link #getPinyin(String)} and its variants return every combination of readings,
 *       separated by commas.</li>
 * </ul>
 *
 * <p>The polyphone dictionary lives in an unsynchronized static map, so loading a dictionary
 * while another thread converts text is not safe.
 *
 * <p>Created: 2020/12/7
 *
 * @author hsy
 */
public class PinyinUtil {

    private static final Logger LOGGER = LoggerFactory.getLogger(PinyinUtil.class);

    /** Dictionary file name, appended to the {@code user.dir} system property. */
    private static final String DEFAULT_DICTIONARY_NAME = "/duoyinzi_dic.txt";

    /** Special characters removed by {@link #strFilter(String)}; compiled once instead of per call. */
    private static final Pattern SPECIAL_CHARACTERS =
            Pattern.compile("[`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?\"]");

    /**
     * Substring windows, as {start offset, end offset} relative to the polyphone's index, that are
     * looked up in the dictionary: the character plus the next two, plus the next one, plus the
     * previous two, plus the previous one, and plus one on each side.
     */
    private static final int[][] CONTEXT_WINDOWS = {{0, 3}, {0, 2}, {-2, 1}, {-1, 1}, {-1, 2}};

    /** Maps a reading (lowercase, toneless, "u:" written as "v") to the words that use it. */
    private static final Map<String, List<String>> pinyinMap = new HashMap<String, List<String>>();

    /** Set once the default dictionary has been read successfully, so it is not re-read per call. */
    private static boolean defaultDictionaryLoaded = false;

    /**
     * Upper-cases the first character of a string and leaves the rest untouched.
     *
     * @param str the text to capitalize; may be {@code null}
     * @return the capitalized text, {@code null} when {@code str} is {@code null}
     */
    public static String convertInitialToUpperCase(String str) {
        if (str == null) {
            return null;
        }
        if (str.isEmpty()) {
            return str;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale (e.g. Turkish dotted I).
        return str.substring(0, 1).toUpperCase(Locale.ROOT) + str.substring(1);
    }

    /**
     * Converts Chinese text to pinyin, one capitalized syllable per character, choosing the reading
     * of polyphonic characters from the polyphone dictionary.
     *
     * <p>The default dictionary ({@code user.dir} + {@code /duoyinzi_dic.txt}) is loaded on first
     * use. ASCII characters and characters pinyin4j has no reading for are copied through
     * unchanged. A polyphone that matches no dictionary entry falls back to the first reading
     * reported by pinyin4j instead of being dropped.
     *
     * @param chinese the text to convert; may be {@code null}
     * @return the converted text, or an empty string for {@code null} or empty input
     */
    public static String convertChineseToPinyin(String chinese) {
        if (chinese == null || chinese.isEmpty()) {
            return "";
        }
        ensureDefaultDictionaryLoaded();
        HanyuPinyinOutputFormat format = getDefaultFormat();
        StringBuilder pinyin = new StringBuilder();
        for (int i = 0; i < chinese.length(); i++) {
            char ch = chinese.charAt(i);
            if (ch < 128) {
                // Plain ASCII never has a pinyin reading.
                pinyin.append(ch);
                continue;
            }
            String[] readings = null;
            try {
                readings = PinyinHelper.toHanyuPinyinStringArray(ch, format);
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                LOGGER.error("Invalid pinyin output format while converting '{}'", ch, e);
            }
            if (readings == null || readings.length == 0) {
                // Not a character pinyin4j knows: keep it rather than discarding the whole result.
                pinyin.append(ch);
                continue;
            }
            pinyin.append(convertInitialToUpperCase(chooseReading(chinese, i, readings)));
        }
        return pinyin.toString();
    }

    /**
     * Selects the reading for the character at {@code index}.
     *
     * <p>Order of preference: the only reading (or the first when the first two are identical), the
     * first reading whose dictionary words contain a context window around the character, the
     * first reading whose dictionary words contain the character on its own, and finally the first
     * reading.
     */
    private static String chooseReading(String text, int index, String[] readings) {
        String first = normalizeUmlaut(readings[0]);
        if (readings.length == 1 || readings[0].equals(readings[1])) {
            return first;
        }
        for (String reading : readings) {
            String candidate = normalizeUmlaut(reading);
            List<String> words = pinyinMap.get(candidate);
            if (words != null && matchesContext(text, index, words)) {
                return candidate;
            }
        }
        // No phrase matched: look for a dictionary entry naming the bare character as the default.
        String single = String.valueOf(text.charAt(index));
        for (String reading : readings) {
            String candidate = normalizeUmlaut(reading);
            List<String> words = pinyinMap.get(candidate);
            if (words != null && words.contains(single)) {
                return candidate;
            }
        }
        return first;
    }

    /** Returns whether any context window around {@code index} is one of {@code words}. */
    private static boolean matchesContext(String text, int index, List<String> words) {
        for (int[] window : CONTEXT_WINDOWS) {
            int start = index + window[0];
            int end = index + window[1];
            if (start >= 0 && end <= text.length() && words.contains(text.substring(start, end))) {
                return true;
            }
        }
        return false;
    }

    /** Rewrites the "u:" spelling produced by the output format as "v". */
    private static String normalizeUmlaut(String reading) {
        return reading.replace("u:", "v");
    }

    /** Builds the path of the default polyphone dictionary. */
    private static String defaultDictionaryPath() {
        return System.getProperty("user.dir") + DEFAULT_DICTIONARY_NAME;
    }

    /** Loads the default dictionary unless a previous load already succeeded. */
    private static void ensureDefaultDictionaryLoaded() {
        if (!defaultDictionaryLoaded) {
            defaultDictionaryLoaded = loadDictionary(defaultDictionaryPath());
        }
    }

    /**
     * Loads a polyphone dictionary file into the shared reading map.
     *
     * <p>Each line has the form {@code reading#word word ...}. Entries for a reading that is
     * already present are replaced. Lines without a {@code #} separator are skipped. A missing or
     * unreadable file is logged and leaves the entries read so far in place.
     *
     * @param fileName path of the dictionary file
     */
    public static void initPinyin(String fileName) {
        if (loadDictionary(fileName) && defaultDictionaryPath().equals(fileName)) {
            defaultDictionaryLoaded = true;
        }
    }

    /**
     * Reads a dictionary file into {@link #pinyinMap}.
     *
     * @return {@code true} when the whole file was read, {@code false} on any failure
     */
    private static boolean loadDictionary(String fileName) {
        if (fileName == null) {
            LOGGER.warn("Polyphone dictionary path is null; nothing loaded");
            return false;
        }
        // NOTE(review): the file is decoded with the platform default charset, as before.
        // Confirm the dictionary's encoding and pass it explicitly if it is fixed (e.g. UTF-8).
        try (InputStream in = new FileInputStream(fileName);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("#");
                if (parts.length < 2) {
                    // Blank or malformed line: there is no word list to register.
                    continue;
                }
                pinyinMap.put(parts[0], Arrays.asList(parts[1].split(" ")));
            }
            return true;
        } catch (IOException e) {
            LOGGER.error("Failed to read polyphone dictionary {}", fileName, e);
            return false;
        }
    }

    /**
     * Converts text to pinyin, listing every combination of readings.
     *
     * <p>Each character contributes one capitalized syllable; characters outside the handled
     * Chinese range contribute themselves. Combinations are separated by commas. The number of
     * combinations is the product of the reading counts of all characters, so long inputs with
     * many polyphones produce very large results.
     *
     * @param paramString the text to convert; may be {@code null}
     * @return the comma-separated combinations, or an empty string for {@code null} or blank input
     */
    public static String getPinyin(String paramString) {
        return getPinyinZh_CN(convertStringByChinese(paramString));
    }

    /**
     * Same as {@link #getPinyin(String)} with the whole result in upper case.
     *
     * @param paramString the text to convert; may be {@code null}
     * @return the upper-cased combinations, or an empty string for {@code null} or blank input
     */
    public static String getPinyinToUpperCase(String paramString) {
        return getPinyin(paramString).toUpperCase(Locale.ROOT);
    }

    /**
     * Same as {@link #getPinyin(String)} with the whole result in lower case.
     *
     * @param paramString the text to convert; may be {@code null}
     * @return the lower-cased combinations, or an empty string for {@code null} or blank input
     */
    public static String getPinyinToLowerCase(String paramString) {
        return getPinyin(paramString).toLowerCase(Locale.ROOT);
    }

    /**
     * Same as {@link #getPinyin(String)}: every syllable starts with an upper-case letter.
     *
     * @param paramString the text to convert; may be {@code null}
     * @return the comma-separated combinations, or an empty string for {@code null} or blank input
     */
    public static String getPinyinFirstToUpperCase(String paramString) {
        return getPinyin(paramString);
    }

    /** Creates the output format shared by the conversions: lowercase, no tones, "u:" for u-umlaut. */
    private static HanyuPinyinOutputFormat getDefaultFormat() {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_U_AND_COLON);
        return format;
    }

    /**
     * Builds every combination of readings for the given text.
     *
     * <p>The combinations are grown one character at a time and de-duplicated at each step, which
     * yields the same set as a full cartesian product while keeping duplicates from multiplying.
     *
     * @return the combinations in first-seen order; empty (never {@code null}) for {@code null} or
     *     blank input
     */
    private static Set<String> convertStringByChinese(String paramString) {
        Set<String> combinations = new LinkedHashSet<String>();
        if (paramString == null || paramString.trim().isEmpty()) {
            return combinations;
        }
        HanyuPinyinOutputFormat format = getDefaultFormat();
        combinations.add("");
        for (int i = 0; i < paramString.length(); i++) {
            String[] readings = readingsOf(paramString.charAt(i), format);
            Set<String> extended = new LinkedHashSet<String>();
            for (String prefix : combinations) {
                for (String reading : readings) {
                    extended.add(prefix + capitalize(reading));
                }
            }
            combinations = extended;
        }
        return combinations;
    }

    /**
     * Returns the readings of a character, or the character itself when it is outside the handled
     * range (U+4E00 to U+9FA5, plus U+3007) or pinyin4j provides no reading for it.
     */
    private static String[] readingsOf(char c, HanyuPinyinOutputFormat format) {
        boolean chinese = (c >= 0x4E00 && c <= 0x9FA5) || c == 0x3007;
        if (chinese) {
            try {
                String[] readings = PinyinHelper.toHanyuPinyinStringArray(c, format);
                if (readings != null && readings.length > 0) {
                    return readings;
                }
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                LOGGER.error("Invalid pinyin output format while converting '{}'", c, e);
            }
        }
        return new String[] {String.valueOf(c)};
    }

    /** Upper-cases the first character when it is an ASCII lowercase letter. */
    private static String capitalize(String paramString) {
        if (paramString.isEmpty()) {
            return paramString;
        }
        char first = paramString.charAt(0);
        if (first < 'a' || first > 'z') {
            return paramString;
        }
        return Character.toUpperCase(first) + paramString.substring(1);
    }

    /** Joins the entries with commas; a {@code null} or empty set gives an empty string. */
    private static String getPinyinZh_CN(Set<String> paramSet) {
        if (paramSet == null || paramSet.isEmpty()) {
            return "";
        }
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (String entry : paramSet) {
            if (!first) {
                joined.append(',');
            }
            joined.append(entry);
            first = false;
        }
        return joined.toString();
    }

    /**
     * Replaces every character that has a pinyin reading with the first letter of its first
     * reading; all other characters are kept as they are.
     *
     * @param paramString the text to convert; may be {@code null}
     * @return the initials, or an empty string for {@code null} or blank input
     */
    public static String getPinYinHeadChar(String paramString) {
        if (paramString == null || paramString.trim().isEmpty()) {
            return "";
        }
        StringBuilder heads = new StringBuilder(paramString.length());
        for (int i = 0; i < paramString.length(); i++) {
            char c = paramString.charAt(i);
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(c);
            if (readings != null && readings.length > 0 && !readings[0].isEmpty()) {
                heads.append(readings[0].charAt(0));
            } else {
                heads.append(c);
            }
        }
        return heads.toString();
    }

    /**
     * Removes ASCII and full-width punctuation and symbols, then trims the result.
     *
     * @param paramString the text to clean; may be {@code null}
     * @return the cleaned text, or an empty string when {@code paramString} is {@code null}
     * @throws PatternSyntaxException declared for source compatibility; the pattern is a constant
     *     that is compiled when the class is initialized
     */
    public static String strFilter(String paramString) throws PatternSyntaxException {
        if (paramString == null) {
            return "";
        }
        return SPECIAL_CHARACTERS.matcher(paramString).replaceAll("").trim();
    }

    /**
     * Returns the pinyin initials of the text with special characters removed.
     *
     * @param paramString the text to convert; may be {@code null}
     * @return the filtered initials, or an empty string for {@code null} or blank input
     */
    public static String getPinYinHeadCharFilter(String paramString) {
        return strFilter(getPinYinHeadChar(paramString));
    }

    /** Demonstrates the conversions on a sample phrase using the default dictionary. */
    public static void main(String[] paramArrayOfString) {
        String path = defaultDictionaryPath();
        System.out.println("路径=====》" + path);
        initPinyin(path);
        String pinyin = convertChineseToPinyin("捋捋关系");
        System.out.println("pinyin=====>" + pinyin);
        String str = "捋捋关系";
        LOGGER.info("小写输出:" + getPinyinToLowerCase(str));
        LOGGER.info("大写输出:" + getPinyinToUpperCase(str));
        LOGGER.info("首字母大写输出:" + getPinyinFirstToUpperCase(str));
        LOGGER.info("返回中文的首字母输出:" + getPinYinHeadChar(str));
        LOGGER.info("返回中文的首字母并过滤特殊字符输出:" + getPinYinHeadCharFilter(str));
    }
}