有 Java 编程相关的问题?

你可以在下面搜索框中键入要查询的问题!

java波特词干分析器代码

我对 Java 还不太熟悉。我正在修一门 NLP 课程,想知道如何针对一个输入文件运行波特词干分析器(Porter Stemmer)的 Java 代码。


共 (1) 个答案

  1. # 1 楼答案

    下面的类名为PorterAlgo,具有各种词干分析功能

    package com.mycompany.algo;
    
    /**
     * Minimal mutable string holder. {@link PorterAlgo#hasSuffix} uses it as an
     * out-parameter to hand the computed stem back to its caller.
     */
    class NewString {
      /** Current value; starts out as the empty string. */
      public String str;

      NewString() {
         str = "";
      }
    }

    /**
     * Porter-style suffix stripper with an additional prefix-stripping pass.
     *
     * <p>The public entry point is {@link #stripAffixes(String)}. The individual
     * steps ({@code step1} .. {@code step5}) and the helper predicates are
     * package-private so that callers in the same package can drive them directly.
     *
     * <p>Vowel convention used throughout: {@code a e i o u} are vowels, and
     * {@code y} is a vowel unless the preceding letter is one of {@code a e i o u}.
     * The first letter of a word is treated as if it were preceded by {@code 'a'},
     * so a leading {@code y} counts as a consonant.
     */
    public class PorterAlgo {

      /** Step 2 rewrite rules, tried in order: {suffix, replacement}. */
      private static final String[][] STEP2_RULES = {
         { "ational", "ate" },
         { "tional",  "tion" },
         { "enci",    "ence" },
         { "anci",    "ance" },
         { "izer",    "ize" },
         { "iser",    "ize" },
         { "abli",    "able" },
         { "alli",    "al" },
         { "entli",   "ent" },
         { "eli",     "e" },
         { "ousli",   "ous" },
         { "ization", "ize" },
         { "isation", "ize" },
         { "ation",   "ate" },
         { "ator",    "ate" },
         { "alism",   "al" },
         { "iveness", "ive" },
         { "fulness", "ful" },
         { "ousness", "ous" },
         { "aliti",   "al" },
         { "iviti",   "ive" },
         { "biliti",  "ble" } };

      /** Step 3 rewrite rules, tried in order: {suffix, replacement}. */
      private static final String[][] STEP3_RULES = {
         { "icate", "ic" },
         { "ative", "" },
         { "alize", "al" },
         { "alise", "al" },
         { "iciti", "ic" },
         { "ical",  "ic" },
         { "ful",   "" },
         { "ness",  "" } };

      /** Step 4 suffixes, tried in order; a match is removed outright. */
      private static final String[] STEP4_SUFFIXES = {
         "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent", "sion", "tion",
         "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise" };

      /** Prefixes removed by {@link #stripPrefixes(String)}, tried in order. */
      private static final String[] PREFIXES = {
         "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo" };

      /**
       * Removes every character that is not a letter or a digit.
       *
       * @param str text to filter; may be empty
       * @return {@code str} with only its letter/digit characters, in order
       */
      String Clean( String str ) {
         StringBuilder kept = new StringBuilder( str.length() );
         for ( int i = 0; i < str.length(); i++ ) {
            char ch = str.charAt( i );
            if ( Character.isLetterOrDigit( ch ) ) {
               kept.append( ch );
            }
         }
         return kept.toString();
      } //clean

      /**
       * Tests whether {@code word} ends with {@code suffix} and is strictly longer
       * than it.
       *
       * @param word   word to inspect
       * @param suffix candidate ending
       * @param stem   out-parameter; receives {@code word} minus its last
       *               {@code suffix.length()} characters whenever the cheap
       *               pre-checks pass. Its value is only meaningful when this
       *               method returns {@code true}.
       * @return {@code true} if {@code word} is longer than {@code suffix} and ends with it
       */
      boolean hasSuffix( String word, String suffix, NewString stem ) {
         if ( word.length() <= suffix.length() ) {
            return false;
         }
         // Cheap rejection on the second-to-last character before doing any real work.
         if ( suffix.length() > 1
               && word.charAt( word.length()-2 ) != suffix.charAt( suffix.length()-2 ) ) {
            return false;
         }

         stem.str = word.substring( 0, word.length() - suffix.length() );
         return word.endsWith( suffix );
      }

      /**
       * Decides whether {@code ch} is a vowel given the letter before it.
       *
       * @param ch   letter to classify
       * @param prev preceding letter; only consulted when {@code ch} is {@code 'y'}
       * @return {@code true} for {@code a e i o u}, and for {@code y} when
       *         {@code prev} is not one of {@code a e i o u}
       */
      boolean vowel( char ch, char prev ) {
         switch ( ch ) {
            case 'a': case 'e': case 'i': case 'o': case 'u':
               return true;
            case 'y':
               return "aeiou".indexOf( prev ) < 0;
            default:
               return false;
         }
      }

      /** Classifies the letter at {@code index}; the first letter is treated as preceded by 'a'. */
      private boolean isVowelAt( String s, int index ) {
         char prev = ( index > 0 ) ? s.charAt( index-1 ) : 'a';
         return vowel( s.charAt( index ), prev );
      }

      /** Returns {@code s} without its last {@code count} characters. */
      private static String dropLast( String s, int count ) {
         return s.substring( 0, s.length() - count );
      }

      /**
       * Counts the vowel-run/consonant transitions in {@code stem}, i.e. the number
       * of times a run of vowels is followed by a consonant.
       *
       * @param stem text to measure; may be empty
       * @return the number of vowel-to-consonant transitions (0 for an empty string)
       */
      int measure( String stem ) {
         final int length = stem.length();
         int i = 0;
         int count = 0;

         while ( i < length ) {
            // Skip the consonant run up to the next vowel.
            while ( i < length && !isVowelAt( stem, i ) ) {
               i++;
            }
            // Step past that vowel, then past the rest of the vowel run.
            i++;
            while ( i < length && isVowelAt( stem, i ) ) {
               i++;
            }
            // A consonant closing the vowel run completes one transition.
            if ( i < length ) {
               count++;
               i++;
            }
         }

         return count;
      }

      /**
       * @param word text to scan
       * @return {@code true} if any letter of {@code word} classifies as a vowel
       */
      boolean containsVowel( String word ) {
         for ( int i = 0; i < word.length(); i++ ) {
            if ( isVowelAt( word, i ) ) {
               return true;
            }
         }
         return false;
      }

      /**
       * Tests whether {@code str} ends consonant-vowel-consonant where the final
       * consonant is not {@code w}, {@code x} or {@code y}.
       *
       * <p>The first letter is classified with the same convention as
       * {@link #measure(String)} and {@link #containsVowel(String)} (a leading
       * {@code y} is a consonant), so the three predicates agree with each other.
       *
       * @param str text to inspect
       * @return {@code true} if the last three letters form such a pattern;
       *         {@code false} for strings shorter than three characters
       */
      boolean cvc( String str ) {
         int length = str.length();
         if ( length < 3 ) {
            return false;
         }

         char last = str.charAt( length-1 );
         return !isVowelAt( str, length-1 )
               && last != 'w' && last != 'x' && last != 'y'
               && isVowelAt( str, length-2 )
               && !isVowelAt( str, length-3 );
      }

      /**
       * Step 1: plural endings, {@code -eed}/{@code -ed}/{@code -ing}, and a
       * trailing {@code y}.
       *
       * @param str word to process; an empty string is returned unchanged
       * @return the word after step 1
       */
      String step1( String str ) {
         if ( str.isEmpty() ) {
            return str;
         }

         NewString stem = new NewString();

         // Plurals: "sses" and "ies" lose two letters, a lone trailing "s" loses one.
         if ( str.charAt( str.length()-1 ) == 's' ) {
            if ( hasSuffix( str, "sses", stem ) || hasSuffix( str, "ies", stem ) ) {
               str = dropLast( str, 2 );
            }
            else {
               if ( str.length() == 1 ) {
                  return "";
               }
               if ( str.charAt( str.length()-2 ) != 's' ) {
                  str = dropLast( str, 1 );
               }
            }
         }

         if ( hasSuffix( str, "eed", stem ) ) {
            if ( measure( stem.str ) > 0 ) {
               str = dropLast( str, 1 );
            }
         }
         else if ( ( hasSuffix( str, "ed", stem ) || hasSuffix( str, "ing", stem ) )
               && containsVowel( stem.str ) ) {

            str = stem.str;
            // A one-letter result is returned as is; the "y" rule below is skipped for it.
            if ( str.length() == 1 ) {
               return str;
            }

            if ( hasSuffix( str, "at", stem ) || hasSuffix( str, "bl", stem ) || hasSuffix( str, "iz", stem ) ) {
               str += "e";
            }
            else {
               int length = str.length();
               char last = str.charAt( length-1 );
               if ( last == str.charAt( length-2 ) && last != 'l' && last != 's' && last != 'z' ) {
                  // Doubled final letter (other than l, s, z): keep a single one.
                  str = dropLast( str, 1 );
               }
               else if ( measure( str ) == 1 && cvc( str ) ) {
                  str += "e";
               }
            }
         }

         // Trailing "y" becomes "i" when the rest of the word contains a vowel.
         if ( hasSuffix( str, "y", stem ) && containsVowel( stem.str ) ) {
            str = dropLast( str, 1 ) + "i";
         }
         return str;
      }

      /**
       * Applies the first rule whose suffix matches {@code str} and whose stem has
       * a measure greater than zero.
       */
      private String applyFirstRule( String str, String[][] rules ) {
         NewString stem = new NewString();
         for ( int index = 0; index < rules.length; index++ ) {
            if ( hasSuffix( str, rules[index][0], stem ) && measure( stem.str ) > 0 ) {
               return stem.str + rules[index][1];
            }
         }
         return str;
      }

      /**
       * Step 2: rewrites the first matching suffix from the step 2 rule table when
       * the remaining stem has a measure greater than zero.
       *
       * @param str word to process
       * @return the rewritten word, or {@code str} if no rule applies
       */
      String step2( String str ) {
         return applyFirstRule( str, STEP2_RULES );
      }

      /**
       * Step 3: rewrites the first matching suffix from the step 3 rule table when
       * the remaining stem has a measure greater than zero.
       *
       * @param str word to process
       * @return the rewritten word, or {@code str} if no rule applies
       */
      String step3( String str ) {
         return applyFirstRule( str, STEP3_RULES );
      }

      /**
       * Step 4: removes the first matching suffix from the step 4 list when the
       * remaining stem has a measure greater than one.
       *
       * @param str word to process
       * @return the shortened word, or {@code str} if no suffix qualifies
       */
      String step4( String str ) {
         NewString stem = new NewString();
         for ( int index = 0; index < STEP4_SUFFIXES.length; index++ ) {
            if ( hasSuffix( str, STEP4_SUFFIXES[index], stem ) && measure( stem.str ) > 1 ) {
               return stem.str;
            }
         }
         return str;
      }

      /**
       * Step 5: drops a final {@code e} and reduces a final {@code ll}, subject to
       * measure conditions.
       *
       * @param str word to process; an empty string is returned unchanged
       * @return the word after step 5
       */
      String step5( String str ) {
         if ( str.isEmpty() ) {
            return str;
         }

         if ( str.charAt( str.length()-1 ) == 'e' ) {
            // measure(str) equals measure(stem) here because the word ends in a vowel.
            int m = measure( str );
            if ( m > 1 ) {
               str = dropLast( str, 1 );
            }
            else if ( m == 1 ) {
               String stem = dropLast( str, 1 );
               if ( !cvc( stem ) ) {
                  str = stem;
               }
            }
         }

         if ( str.length() == 1 ) {
            return str;
         }
         if ( str.endsWith( "ll" ) && measure( str ) > 1 ) {
            str = dropLast( str, 1 );
         }
         return str;
      }

      /**
       * Removes the first prefix from the prefix list that {@code str} starts with.
       *
       * @param str word to process
       * @return the word without that prefix (possibly empty), or {@code str} if
       *         none matches
       */
      String stripPrefixes ( String str) {
         for ( int i = 0; i < PREFIXES.length; i++ ) {
            if ( str.startsWith( PREFIXES[i] ) ) {
               return str.substring( PREFIXES[i].length() );
            }
         }
         return str;
      }


      /** Runs steps 1 to 5 in order, skipping the remaining steps once the word is empty. */
      private String stripSuffixes( String str ) {

         str = step1( str );
         if ( str.length() >= 1 )
            str = step2( str );
         if ( str.length() >= 1 )
            str = step3( str );
         if ( str.length() >= 1 )
            str = step4( str );
         if ( str.length() >= 1 )
            str = step5( str );

         return str;
      }


      /**
       * Lower-cases and cleans {@code str}, then strips a known prefix and runs the
       * suffix steps. Words of one or two characters (after cleaning) are returned
       * without any stripping.
       *
       * @param str raw token; may be empty
       * @return the stemmed token
       */
      public String stripAffixes( String str ) {

         str = Clean( str.toLowerCase() );

         if ( str.length() > 2 ) {
            str = stripPrefixes( str );

            // The whole word may have been a prefix; nothing is left to stem then.
            if ( !str.isEmpty() ) {
               str = stripSuffixes( str );
            }
         }

         return str;
      } //stripAffixes

    } //class
    

    下面给出的是 PorterCheck 类(PorterCheck.java)。

    package com.mycompany.algo;
    
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.*;
    
    /**
     * Small command-line demo for {@code PorterAlgo}: prints the result of a few
     * helper calls, stems a fixed word list, then stems the tokens of a text file.
     */
    public class PorterCheck {
        /** File that is stemmed when no path is given on the command line. */
        private static final String DEFAULT_TEST_FILE = "C:/Users/vaibhav/Desktop/rev.txt";

        /**
         * Runs the demo.
         *
         * @param args optional; {@code args[0]} is the path of the file to tokenize
         *             and stem, defaulting to {@link #DEFAULT_TEST_FILE}
         * @throws IOException if the input file cannot be opened, read or closed
         */
        public static void main(String args[]) throws IOException{
            PorterAlgo pa = new PorterAlgo();

            //checks for vowels in a given string
            System.out.println(pa.containsVowel("vaibhav"));

            //removes special characters
            System.out.println(pa.Clean("vaibhav's book"));

            //check for a given suffix
            NewString stem = new NewString();
            System.out.println(pa.hasSuffix("corresponding","ing",stem));

            //stemming the words
            ArrayList<String> tok = new ArrayList<String>();
            String[] tokens = {"normalize","technical","education"};
            for (String x: tokens){
                tok.add(x);
            }
            System.out.println(completeStem(tok));

            String fileName = ((args.length > 0) ? args[0] : DEFAULT_TEST_FILE);
            FileReader fileReader = new FileReader(new File(fileName));
            try {
                // FileTokenizer is declared elsewhere in the project. The token list is
                // consumed while the reader is still open in case it is read lazily.
                FileTokenizer fileTokenizer = new FileTokenizer();
                List<String> tokens1 = fileTokenizer.tokenize(fileReader);

                System.out.println("Tokenizing the input file:");
                System.out.print(completeStem(tokens1));
            } finally {
                // Release the file handle even if tokenizing or stemming fails.
                fileReader.close();
            }
        }

        /**
         * Runs every token through {@code step1} .. {@code step5} of
         * {@code PorterAlgo}, in order.
         *
         * <p>Once a token has been reduced to the empty string the remaining steps
         * are skipped, because the steps index into the last character of their
         * argument. An empty token is passed through unchanged.
         *
         * @param tokens1 words to stem
         * @return a new list with one stemmed entry per input token, in input order
         */
        public static ArrayList<String> completeStem(List<String> tokens1){
            PorterAlgo pa = new PorterAlgo();
            ArrayList<String> arrstr = new ArrayList<String>();
            for (String token : tokens1){
                String word = token;
                if (!word.isEmpty()) {
                    word = pa.step1(word);
                }
                if (!word.isEmpty()) {
                    word = pa.step2(word);
                }
                if (!word.isEmpty()) {
                    word = pa.step3(word);
                }
                if (!word.isEmpty()) {
                    word = pa.step4(word);
                }
                if (!word.isEmpty()) {
                    word = pa.step5(word);
                }
                arrstr.add(word);
            }
            return arrstr;
        }

        /**
         * Splits the fixed sample sentence {@code "this is a book"} on whitespace.
         *
         * @return the tokens of the sample sentence, in order
         */
        public static ArrayList<String> fileTokenizer(){
            StringTokenizer strtoken = new StringTokenizer("this is a book");
            ArrayList<String> filetoken = new ArrayList<String>();
            while(strtoken.hasMoreTokens()){
                filetoken.add(strtoken.nextToken());
            }
            return filetoken;
        }
    }
    

    希望这对你有帮助:D