博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Solr入门之pinyin4j源代码改写:动态加入扩展词并整合进war项目中
阅读量:6677 次
发布时间:2019-06-25

本文共 17880 字,大约阅读时间需要 59 分钟。

// 1. Load the user-defined dictionary during initialization.
package net.sourceforge.pinyin4j;

import net.sourceforge.pinyin4j.multipinyin.Trie;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import com.gome.mx.plus.pinyin.ext.PYWriterUtils;

/**
 * Manage all external resources required in PinyinHelper class.
 *
 * <p>This modified version additionally loads a user-defined dictionary file
 * whose location is obtained from {@link PYWriterUtils#getPath()}.
 *
 * @author Li Min (xmlerlimin@gmail.com)
 */
public class ChineseToPinyinResource {
    /**
     * A hash table contains {@code <Unicode, HanyuPinyin>} pairs
     */
    private Trie unicodeToHanyuPinyinTable = null;

    /**
     * @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set.
     */
    private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) {
        this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable;
    }

    /**
     * Made public in this modified version so that {@link PYWriterUtils} can
     * query and reload the table.
     *
     * @return Returns the unicodeToHanyuPinyinTable.
     */
    public Trie getUnicodeToHanyuPinyinTable() {
        return unicodeToHanyuPinyinTable;
    }

    /**
     * Private constructor as part of the singleton pattern.
     */
    private ChineseToPinyinResource() {
        initializeResource();
    }

    /**
     * Initialize a hash-table contains {@code <Unicode, HanyuPinyin>} pairs.
     *
     * <p>Loads the bundled single-character and multi-pinyin resources first and
     * then, when a user dictionary path has been configured, the user dictionary
     * on top of them. I/O failures are printed and otherwise ignored, so the
     * table may be left partially loaded.
     */
    private void initializeResource() {
        try {
            final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
            final String resourceMultiName = "/pinyindb/multi_pinyin.txt";
            final String userResourceName = PYWriterUtils.getPath();

            setUnicodeToHanyuPinyinTable(new Trie());
            getUnicodeToHanyuPinyinTable().load(ResourceHelper.getResourceInputStream(resourceName));
            getUnicodeToHanyuPinyinTable().loadMultiPinyin(ResourceHelper.getResourceInputStream(resourceMultiName));
            getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();

            // Load the user-defined dictionary, if one has been configured.
            if (userResourceName != null) {
                FileInputStream is = new FileInputStream(new File(userResourceName));
                try {
                    getUnicodeToHanyuPinyinTable().load(is);
                } finally {
                    // Always release the file handle; closing twice is harmless
                    // should Trie.load already close the stream itself.
                    is.close();
                }
            }
        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Looks up the trie node stored under the upper-case hexadecimal code of
     * the given character.
     *
     * @param ch given Chinese character in Unicode
     * @return whatever the table returns for that key
     */
    Trie getHanyuPinyinTrie(char ch) {
        String codepointHexStr = Integer.toHexString((int) ch).toUpperCase();

        // fetch from hashtable
        return getUnicodeToHanyuPinyinTable().get(codepointHexStr);
    }

    /**
     * Get the unformatted Hanyu Pinyin representations of the given Chinese
     * character in array format.
     *
     * @param ch given Chinese character in Unicode
     * @return The Hanyu Pinyin strings of the given Chinese character in array
     *         format; return null if there is no corresponding Pinyin string.
     */
    String[] getHanyuPinyinStringArray(char ch) {
        String pinyinRecord = getHanyuPinyinRecordFromChar(ch);
        return parsePinyinString(pinyinRecord);
    }

    /**
     * Splits a record of the form {@code (a1,b2,...)} into its readings.
     *
     * @param pinyinRecord record string, may be null
     * @return the comma-separated readings between the first left bracket and
     *         the last right bracket; null if the record is null or the
     *         brackets are missing or out of order
     */
    String[] parsePinyinString(String pinyinRecord) {
        if (null == pinyinRecord) {
            return null; // no record found
        }

        int indexOfLeftBracket = pinyinRecord.indexOf(Field.LEFT_BRACKET);
        int indexOfRightBracket = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET);

        // Mal-formatted record: without this guard substring() would throw
        // instead of yielding the null promised above.
        if (indexOfLeftBracket < 0 || indexOfRightBracket <= indexOfLeftBracket) {
            return null;
        }

        String stripedString =
                pinyinRecord.substring(indexOfLeftBracket + Field.LEFT_BRACKET.length(),
                        indexOfRightBracket);

        return stripedString.split(Field.COMMA);
    }

    /**
     * @param record given record string of Hanyu Pinyin
     * @return return true if record is not null and record is not "none0" and
     *         record is not mal-formatted, else return false
     */
    private boolean isValidRecord(String record) {
        final String noneStr = "(none0)";

        return (null != record) && !record.equals(noneStr)
                && record.startsWith(Field.LEFT_BRACKET)
                && record.endsWith(Field.RIGHT_BRACKET);
    }

    /**
     * @param ch given Chinese character in Unicode
     * @return corresponding Hanyu Pinyin Record in Properties file; null if no
     *         record found
     */
    private String getHanyuPinyinRecordFromChar(char ch) {
        // convert Chinese character to code point (integer)
        // please refer to http://www.unicode.org/glossary/#code_point
        // Another reference: http://en.wikipedia.org/wiki/Unicode
        int codePointOfChar = ch;

        String codepointHexStr = Integer.toHexString(codePointOfChar).toUpperCase();

        // fetch from hashtable
        Trie trie = getUnicodeToHanyuPinyinTable().get(codepointHexStr);

        String foundRecord = null;
        if (trie != null) {
            foundRecord = trie.getPinyin();
        }

        return isValidRecord(foundRecord) ? foundRecord : null;
    }

    /**
     * Singleton factory method.
     *
     * @return the one and only instance of this class.
     */
    public static ChineseToPinyinResource getInstance() {
        return ChineseToPinyinResourceHolder.theInstance;
    }

    /**
     * Singleton implementation helper.
     */
    private static class ChineseToPinyinResourceHolder {
        static final ChineseToPinyinResource theInstance = new ChineseToPinyinResource();
    }

    /**
     * A class encloses common string constants used in Properties files
     *
     * @author Li Min (xmlerlimin@gmail.com)
     */
    class Field {
        static final String LEFT_BRACKET = "(";
        static final String RIGHT_BRACKET = ")";
        static final String COMMA = ",";
    }
}

// Batch-write support added.
package com.gome.mx.plus.pinyin.ext;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import javax.xml.crypto.dsig.spec.ExcC14NParameterSpec;

import net.sourceforge.pinyin4j.ChineseToPinyinResource;
import net.sourceforge.pinyin4j.ResourceHelper;
import net.sourceforge.pinyin4j.multipinyin.MultiPinyinConfig;
import net.sourceforge.pinyin4j.multipinyin.Trie;

/**
 * Writes Chinese characters and their pinyin into a user dictionary file.
 *
 * <ul>
 * <li>The file location is configurable (once) through {@link #setPath(String)}.</li>
 * <li>After every write the file is loaded into the pinyin table again, so no
 *     service restart is needed.</li>
 * <li>The caller chooses between overwriting the file and appending to it.</li>
 * <li>The caller chooses whether readings already present in the table are
 *     merged into the written line.</li>
 * </ul>
 *
 * <p>Each written line has the form {@code 3007 (ling2,yuan2)}: the upper-case
 * hexadecimal code of the character, a space, and the bracketed readings.
 *
 * <p>NOTE(review): nothing here synchronizes concurrent writers; confirm that
 * callers serialize access to the dictionary file.
 *
 * @author songqinghu
 */
public class PYWriterUtils {

    /** Placeholder reading used by the pinyin tables for "no pinyin" (see ChineseToPinyinResource.isValidRecord). */
    private static final String NONE_READING = "none0";

    // Absolute file-system path of the user dictionary.
    private static String path;

    // True while the path may still be set; flipped by the first setPath call.
    private static boolean flag = true;

    /**
     * Sets the location of the user dictionary. Only the first call has any
     * effect; later calls are ignored.
     *
     * @param path path of the user dictionary file
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static void setPath(String path) {
        if (flag) {
            PYWriterUtils.path = path;
            flag = false; // can only be set once
        }
    }

    /**
     * @return the configured user dictionary path, or null if none was set
     */
    public static String getPath() {
        return PYWriterUtils.path;
    }

    private static Class<?> pathClass = PYWriterUtils.class;

    /**
     * Default single-character write: append to the file and merge with the
     * readings already known for the character.
     *
     * @param word   Chinese character (only the first char is used)
     * @param pinyin pinyin without tone
     * @param voice  tone number
     * @return see {@link #writerControler(String, String, Integer, boolean, boolean)}
     * @throws Exception see {@link #writerControler(String, String, Integer, boolean, boolean)}
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static boolean dufaultWriter(String word, String pinyin, Integer voice) throws Exception {
        return writerControler(word, pinyin, voice, true, true);
    }

    /**
     * Writes the mapping for a single character and reloads the dictionary.
     *
     * <p>The line is built before the file is opened, so a failure during the
     * table lookup can neither leak the writer nor happen after the file has
     * been truncated.
     *
     * @param word       Chinese character (only the first char is used; chars
     *                   with a code of 128 or below are skipped)
     * @param pinyin     pinyin without tone
     * @param voice      tone number, appended to {@code pinyin}
     * @param additional true to append to the file, false to overwrite it
     * @param merge      true to merge with readings already present in the table
     * @return true if the dictionary file exists (even when {@code word} was
     *         skipped); false if the configured file does not exist
     * @throws Exception if no dictionary path has been configured, or on I/O failure
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static boolean writerControler(String word, String pinyin, Integer voice,
            boolean additional, boolean merge) throws Exception {
        String path = PYWriterUtils.path;
        if (path == null) {
            throw new Exception("找不到用户扩展字典");
        }

        File userMultiPinyinFile = new File(path);
        if (!userMultiPinyinFile.exists()) {
            return false;
        }

        // Append the tone, then reuse the shared single-character formatter.
        String line = midWriter(word, pinyin + voice, merge);

        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(userMultiPinyinFile, additional)));
        try {
            if (line != null) {
                writer.write(line);
                writer.newLine();
            }
            writer.flush();
        } finally {
            writer.close();
        }

        // Writing finished: refresh the in-memory table.
        reloadText();
        return true;
    }

    /**
     * Adds a batch of character-to-pinyin mappings to the user dictionary.
     *
     * <p>Known limitation: the inner map is keyed by pinyin, so two readings of
     * one character that differ only in tone cannot be submitted in one call;
     * submit them in two calls.
     *
     * <p>Entries with a null key, or a null or empty reading map, are skipped
     * instead of aborting the whole batch. All lines are built before the file
     * is opened. Failures are printed and reported as {@code false}.
     *
     * @param contents   character -&gt; (pinyin -&gt; tone)
     * @param additional true to append to the file, false to overwrite it
     * @param merge      true to merge with readings already present in the table
     * @return true if the file existed and the batch was written; false otherwise
     * @createTime 2016-04-07
     * @author songqinghu
     */
    public static boolean writerBatch(Map<String, Map<String, Integer>> contents,
            boolean additional, boolean merge) {
        BufferedWriter writer = null;
        try {
            if (path == null) {
                throw new Exception("请配置用户词典绝对路径");
            }
            File userMultiPinyinFile = new File(path);
            if (!userMultiPinyinFile.exists()) {
                return false;
            }

            List<String> lines = new ArrayList<String>();
            for (Entry<String, Map<String, Integer>> entry : contents.entrySet()) {
                String word = entry.getKey();
                Map<String, Integer> readings = entry.getValue();
                if (word == null || readings == null || readings.isEmpty()) {
                    continue;
                }

                // Join "pinyin + tone" pairs with commas.
                StringBuilder pinyin = new StringBuilder();
                for (Entry<String, Integer> content : readings.entrySet()) {
                    if (content.getKey() == null) {
                        continue;
                    }
                    if (pinyin.length() > 0) {
                        pinyin.append(Field.COMMA);
                    }
                    pinyin.append(content.getKey().trim()).append(content.getValue());
                }

                String line = midWriter(word.trim(), pinyin.toString(), merge);
                if (line != null) {
                    lines.add(line);
                }
            }

            writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(userMultiPinyinFile, additional)));
            for (String line : lines) {
                writer.write(line);
                writer.newLine();
            }
            writer.flush();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Only reload when the file was actually opened.
            if (writer != null) {
                try {
                    writer.close();
                    PYWriterUtils.reloadText();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return false;
    }

    /**
     * Builds the dictionary line for a single character.
     *
     * <p>Readings are trimmed, empty readings are dropped, duplicates are
     * removed, and order is preserved: existing readings first (when merging),
     * then the new ones.
     *
     * @param word   Chinese character (only the first char is used)
     * @param pinyin comma-separated readings including the tone, e.g. {@code ang3,yi1}
     * @param merge  true to prepend the readings already stored in the table
     * @return a line such as {@code E4A3 (ang3,yi1,wang3)}, or null when the
     *         word is empty, its first char has a code of 128 or below, or no
     *         reading remains
     * @createTime 2016-04-07
     * @author songqinghu
     */
    private static String midWriter(String word, String pinyin, boolean merge) {
        if (word == null || word.length() == 0 || pinyin == null) {
            return null;
        }
        char c = word.charAt(0);
        if (c <= 128) { // not treated as a Chinese character
            return null;
        }
        String unicode = Integer.toHexString(c).toUpperCase();

        Set<String> readings = new LinkedHashSet<String>();
        if (merge) {
            Trie table = ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable();
            Trie node = table.get(unicode);
            String before = (node == null) ? null : node.getPinyin();
            if (before != null) {
                addReadings(readings, stripBrackets(before));
                // The placeholder means "no pinyin"; never merge it as a reading.
                readings.remove(NONE_READING);
            }
        }
        addReadings(readings, pinyin);

        if (readings.isEmpty()) {
            return null;
        }
        return unicode + Field.SPACE + addSymbol(join(readings));
    }

    /** Removes one pair of enclosing brackets from a stored record, if present. */
    private static String stripBrackets(String record) {
        String trimmed = record.trim();
        if (trimmed.length() >= 2 && trimmed.startsWith(Field.LEFT_BRACKET)
                && trimmed.endsWith(Field.RIGHT_BRACKET)) {
            return trimmed.substring(1, trimmed.length() - 1);
        }
        return trimmed;
    }

    /** Adds every non-empty, trimmed reading of a comma-separated list to the set. */
    private static void addReadings(Set<String> target, String commaSeparated) {
        String[] parts = commaSeparated.split(Field.COMMA);
        for (String part : parts) {
            String reading = part.trim();
            if (reading.length() > 0) {
                target.add(reading);
            }
        }
    }

    /** Joins the readings with commas, in iteration order. */
    private static String join(Set<String> readings) {
        StringBuilder joined = new StringBuilder();
        for (String reading : readings) {
            if (joined.length() > 0) {
                joined.append(Field.COMMA);
            }
            joined.append(reading);
        }
        return joined.toString();
    }

    /**
     * Default batch write: append to the file and merge with existing readings.
     *
     * @param contents character -&gt; (pinyin -&gt; tone)
     * @return see {@link #writerBatch(Map, boolean, boolean)}
     * @createTime 2016-04-07
     * @author songqinghu
     */
    public static boolean defaultWriterBatch(Map<String, Map<String, Integer>> contents) {
        return writerBatch(contents, true, true);
    }

    /**
     * Loads the user dictionary file into the shared pinyin table again; call
     * it whenever the file has changed.
     *
     * @return true if a path is configured and the file was loaded; false if
     *         no path is configured
     * @throws IOException if the file cannot be opened or read
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static boolean reloadText() throws IOException {
        if (path == null) {
            return false;
        }
        FileInputStream is = new FileInputStream(new File(path));
        try {
            ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().load(is);
        } finally {
            // Release the handle even if load fails; a second close is harmless.
            is.close();
        }
        return true;
    }

    /**
     * Wraps the readings in the brackets used by the dictionary format.
     */
    private static String addSymbol(String pinyin) {
        return Field.LEFT_BRACKET + pinyin + Field.RIGHT_BRACKET;
    }

    /** String constants of the dictionary line format. */
    class Field {
        static final String LEFT_BRACKET = "(";
        static final String RIGHT_BRACKET = ")";
        static final String COMMA = ",";
        static final String SPACE = " ";
    }
}
// Integrating the jar into the existing suggest project ran into a problem: the
// user-defined dictionary file inside the jar cannot be written (files inside a
// jar can only be read). Solution: keep the user-defined dictionary in the
// running war project and specify the file location once by hand. With that the
// feature is roughly ready to be integrated into the project.
package cn.com.mx.gome.suggest.controller;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;

import com.gome.mx.plus.pinyin.ext.PYReadUtils;
import com.gome.mx.plus.pinyin.ext.PYWriterUtils;

import cn.com.mx.gome.search.core.common.ResultData;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Endpoints operating on the pinyin4j user-defined dictionary.
 *
 * @author songqinghu
 */
@Controller
@RequestMapping("/suggest/pinyin")
public class PinYinController {

    private static final Logger logger = LoggerFactory.getLogger(PinYinController.class);

    /**
     * Returns the pinyin currently stored for the given text.
     *
     * <p>NOTE(review): the original comment says a POST request is required,
     * but the mapping declares no HTTP method restriction; confirm.
     *
     * @param word text to look up
     * @return result carrying the readings on success; {@code success == false}
     *         when the input is blank, nothing is found or the lookup fails
     * @createTime 2016-04-07
     * @author songqinghu
     */
    @RequestMapping("/getpy")
    @ResponseBody
    public ResultData<List<String>> getPinYin(String word) {
        ResultData<List<String>> result = new ResultData<List<String>>();
        try {
            if (word != null && word.trim().length() > 0) {
                String[] fullPY = PYReadUtils.getFullPY(word);
                if (fullPY != null && fullPY.length > 0) {
                    ArrayList<String> list = new ArrayList<String>();
                    for (String string : fullPY) {
                        list.add(string);
                    }
                    result.setData(list);
                    result.setSuccess(true);
                    return result;
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            logger.error("", e);
        }
        result.setSuccess(false);
        return result;
    }

    /**
     * Adds the mapping for one character to the user-defined dictionary.
     *
     * @param word   Chinese character
     * @param pinyin pinyin without tone
     * @param voice  tone number, must be positive
     * @return result carrying the writer's return value on success;
     *         {@code success == false} on invalid input or failure
     * @createTime 2016-04-07
     * @author songqinghu
     */
    @RequestMapping("/addpy")
    @ResponseBody
    public ResultData<Boolean> addPinYin(String word, String pinyin, Integer voice) {
        ResultData<Boolean> result = new ResultData<Boolean>();
        if (isValidInput(word, pinyin, voice)) {
            try {
                boolean flag = PYWriterUtils.dufaultWriter(word, pinyin, voice);
                result.setData(flag);
                result.setSuccess(true);
                return result;
            } catch (Exception e) {
                logger.error("", e);
            }
        }
        result.setSuccess(false);
        return result;
    }

    /**
     * Test endpoint for the batch writer. The request parameters are only
     * validated; the data written is a fixed sample batch for the character
     * "〇".
     *
     * @param word   validated for non-blankness only
     * @param pinyin validated for non-blankness only
     * @param voice  must be positive
     * @return result carrying the batch writer's return value on success;
     *         {@code success == false} on invalid input or failure
     */
    @RequestMapping("/test")
    @ResponseBody
    public ResultData<Boolean> addtest(String word, String pinyin, Integer voice) {
        Map<String, Map<String, Integer>> contents = new HashMap<String, Map<String, Integer>>();
        HashMap<String, Integer> content = new HashMap<String, Integer>();
        content.put("test", 1);
        content.put("tttt", 2);
        content.put("ling", 1);
        contents.put("〇", content);

        ResultData<Boolean> result = new ResultData<Boolean>();
        if (isValidInput(word, pinyin, voice)) {
            try {
                boolean flag = PYWriterUtils.defaultWriterBatch(contents);
                result.setData(flag);
                result.setSuccess(true);
                return result;
            } catch (Exception e) {
                logger.error("", e);
            }
        }
        result.setSuccess(false);
        return result;
    }

    /**
     * Shared request validation. The explicit null check on {@code voice}
     * matters: a missing request parameter arrives as null and would otherwise
     * fail with a NullPointerException when unboxed for the comparison.
     */
    private static boolean isValidInput(String word, String pinyin, Integer voice) {
        return word != null && word.trim().length() > 0
                && pinyin != null && pinyin.trim().length() > 0
                && voice != null && voice > 0;
    }
}
// Usage in the war project (SSM stack): helper that loads the dictionary
// location when the project starts.
package cn.com.mx.gome.suggest.component;

import java.io.File;
import java.net.URISyntaxException;
import java.net.URL;

import javax.annotation.PostConstruct;

import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import com.gome.mx.plus.pinyin.ext.PYWriterUtils;

import cn.com.mx.gome.suggest.controller.PinYinController;

/**
 * Configures the pinyin4j user extension dictionary when the project starts.
 *
 * @author songqinghu
 */
@Component
public class PinYinDataSourceFile {

    // Resource name of the dictionary, resolved through Class.getResource.
    @Value("${PINYIN_FILE_PATH}")
    private String path;

    /**
     * Runs once this bean has been constructed: resolves the configured
     * resource to a file-system path and hands it to
     * {@link PYWriterUtils#setPath(String)}.
     *
     * @throws IllegalStateException if the configured resource cannot be found
     * @createTime 2016-04-07
     * @author songqinghu
     */
    @PostConstruct
    private void setFilePath() {
        URL resource = PinYinDataSourceFile.class.getResource(path);
        if (resource == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "pinyin4j user dictionary not found on classpath: " + path);
        }
        PYWriterUtils.setPath(toFilePath(resource));
    }

    /**
     * Converts a resource URL to a file-system path.
     *
     * <p>{@code URL.getPath()} is percent-encoded (a space becomes {@code %20}),
     * which does not name an existing file; going through the URI decodes it.
     * URLs that cannot be converted to a file fall back to the raw URL path.
     */
    private static String toFilePath(URL resource) {
        try {
            return new File(resource.toURI()).getPath();
        } catch (URISyntaxException e) {
            return resource.getPath();
        } catch (IllegalArgumentException e) {
            // e.g. a non-"file" URL such as a resource inside a jar
            return resource.getPath();
        }
    }
}
最后附上 改写后的pinyin4j源代码
链接:http://pan.baidu.com/s/1skUD8dv password:fhy4

转载于:https://www.cnblogs.com/yutingliuyl/p/7249997.html

你可能感兴趣的文章
05_maven生命周期
查看>>
我的友情链接
查看>>
Swift2.0语言教程之函数的返回值与函数类型
查看>>
泡沫学员CSS切图学习总结
查看>>
centos 学习日记 文件隐藏属性 chattr lsattr
查看>>
redhat yum 失败
查看>>
log4j2日志框架使用简单概述
查看>>
新手处理事故之误删boot目录以及更严重的删除操作
查看>>
bootstap-table 只显示列名和表格不显示数据
查看>>
linux 网站架设调优Apache(四)
查看>>
vi的使用
查看>>
当你需要处理XML文档时
查看>>
【Python之旅】第五篇(一):Python Socket通信原理
查看>>
pycharm 5注册
查看>>
java-buildpack源码分析之Release
查看>>
网络安全市场分类综述
查看>>
Puppet安装(一):安装服务端和客户端
查看>>
STL源码剖析之算法:partial_sort & partial_sort_copy
查看>>
10-8 项目整体管理、项目范围管理
查看>>
iptables实现网络防火墙及地址转换
查看>>