Java/Email/Spam

Determines probability that text contains Spam

<source lang="java">

/****************************************************************

* Licensed to the Apache Software Foundation (ASF) under one   *
* or more contributor license agreements.  See the NOTICE file *
* distributed with this work for additional information        *
* regarding copyright ownership.  The ASF licenses this file   *
* to you under the Apache License, Version 2.0 (the            *
* "License"); you may not use this file except in compliance   *
* with the License.  You may obtain a copy of the License at   *
*                                                              *
*   http://www.apache.org/licenses/LICENSE-2.0                 *
*                                                              *
* Unless required by applicable law or agreed to in writing,   *
* software distributed under the License is distributed on an  *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
* KIND, either express or implied.  See the License for the    *
* specific language governing permissions and limitations      *
* under the License.                                           *
****************************************************************/

// Revised from Apache James

import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Collection;
import java.util.ArrayList;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;

/**
 * Determines probability that text contains Spam.
 *
 * Based upon Paul Graham's "A Plan for Spam".
 *
 * Sample method usage:
 *
 * Use:
 *   void addHam(Reader)
 * and
 *   void addSpam(Reader)
 * methods to build up the Maps of ham & spam tokens/occurrences.
 * Both addHam and addSpam assume they're reading one message at a time;
 * if you feed more than one message per call, be sure to adjust the
 * appropriate message counter: hamMessageCount or spamMessageCount.
 *
 * Then use:
 *   void buildCorpus()
 * to build the final token/probabilities Map.
 *
 * Use your own methods for persistent storage of either the individual
 * ham/spam corpus & message counts, and/or the final corpus.
 *
 * Then you can use:
 *   double computeSpamProbability(Reader)
 * to determine the probability that a particular text contains spam.
 * A returned result of 0.9 or above is an indicator that the text was
 * spam.
 *
 * If you use persistent storage, use:
 *   void setCorpus(Map)
 * before calling computeSpamProbability.
 *
 * @version CVS $Revision: $ $Date: $
 * @since 2.3.0
 */

public class BayesianAnalyzer {

   /**
    * Number of "interesting" tokens to use to compute overall
    * spamminess probability.
    */
   private final static int MAX_INTERESTING_TOKENS = 15;
   
   /**
    * Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall
    * spamminess probability.
    */
   private final static double INTERESTINGNESS_THRESHOLD = 0.46;
   
   /**
    * Default token probability to use when a token has not been
    * encountered before.
    */
   private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;
   
   /**
    * Map of ham tokens and their occurrences.
    *
    * String key
    * Integer value
    */
   private Map hamTokenCounts = new HashMap();
   
   /**
    * Map of spam tokens and their occurrences.
    *
    * String key
    * Integer value
    */
   private Map spamTokenCounts = new HashMap();
   
   /**
    * Number of ham messages analyzed.
    */
   private int hamMessageCount = 0;
   
   /**
    * Number of spam messages analyzed.
    */
   private int spamMessageCount = 0;
   
   /**
    * Final token/probability corpus.
    *
    * String key
    * Double value
    */
   private Map corpus = new HashMap();
   
   /**
    * Inner class for managing Token Probability Strengths during the
    * computeSpamProbability phase.
    *
    * By probability strength we mean the absolute distance of a
    * probability from the middle value 0.5.
    *
     * It implements Comparable so that its sorting is automatic.
    */
   private class TokenProbabilityStrength
   implements Comparable {
       /**
        * Message token.
        */
       String token = null;
       
       /**
        * Token"s computed probability strength.
        */
       double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
       
       /**
        * Force the natural sort order for this object to be high-to-low.
        *
        * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
        *                                this instance with.
        *
        * @return The result of the comparison (before, equal, after).
        */
       public final int compareTo(Object anotherTokenProbabilityStrength) {
           int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);
           if (result == 0) {
                return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);
           } else {
               return result;
           }
       }
       
       /**
        * Simple toString () implementation mostly for debugging purposes.
        *
        * @return String representation of this object.
        */
       public String toString() {
           StringBuffer sb = new StringBuffer(30);
           
           sb.append(token)
           .append("=")
           .append(strength);
           
           return sb.toString();
       }
   }
   
   /**
    * Basic class constructor.
    */
   public BayesianAnalyzer() {
   }
   
   /**
    * Public setter for the hamTokenCounts Map.
    *
    * @param hamTokenCounts The new ham Token counts Map.
    */
   public void setHamTokenCounts(Map hamTokenCounts) {
       this.hamTokenCounts = hamTokenCounts;
   }
   
   /**
    * Public getter for the hamTokenCounts Map.
    */
   public Map getHamTokenCounts() {
       return this.hamTokenCounts;
   }
   
   /**
    * Public setter for the spamTokenCounts Map.
    *
    * @param spamTokenCounts The new spam Token counts Map.
    */
   public void setSpamTokenCounts(Map spamTokenCounts) {
       this.spamTokenCounts = spamTokenCounts;
   }
   
   /**
    * Public getter for the spamTokenCounts Map.
    */
   public Map getSpamTokenCounts() {
       return this.spamTokenCounts;
   }
   
   /**
    * Public setter for spamMessageCount.
    *
    * @param spamMessageCount The new spam message count.
    */
   public void setSpamMessageCount(int spamMessageCount) {
       this.spamMessageCount = spamMessageCount;
   }
   
   /**
    * Public getter for spamMessageCount.
    */
   public int getSpamMessageCount() {
       return this.spamMessageCount;
   }
   
   /**
    * Public setter for hamMessageCount.
    *
    * @param hamMessageCount The new ham message count.
    */
   public void setHamMessageCount(int hamMessageCount) {
       this.hamMessageCount = hamMessageCount;
   }
   
   /**
    * Public getter for hamMessageCount.
    */
   public int getHamMessageCount() {
       return this.hamMessageCount;
   }
   
   /**
    * Clears all analysis repositories and counters.
    */
   public void clear() {
       corpus.clear();
       
       tokenCountsClear();
       
       hamMessageCount = 0;
       spamMessageCount = 0;
   }
   
   /**
    * Clears token counters.
    */
   public void tokenCountsClear() {
       hamTokenCounts.clear();
       spamTokenCounts.clear();
   }
   
   /**
    * Public setter for corpus.
    *
    * @param corpus The new corpus.
    */
   public void setCorpus(Map corpus) {
       this.corpus = corpus;
   }
   
   /**
    * Public getter for corpus.
    */
   public Map getCorpus() {
       return this.corpus;
   }
   
   /**
    * Builds the corpus from the existing ham & spam counts.
    */
   public void buildCorpus() {
       //Combine the known ham & spam tokens.
       Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
       set.addAll(hamTokenCounts.keySet());
       set.addAll(spamTokenCounts.keySet());
       Map tempCorpus = new HashMap(set.size());
       
       //Iterate through all the tokens and compute their new
       //individual probabilities.
       Iterator i = set.iterator();
       while (i.hasNext()) {
           String token = (String) i.next();
           tempCorpus.put(token, new Double(computeProbability(token)));
       }
       setCorpus(tempCorpus);
   }
   
   /**
    * Adds a message to the ham list.
    * @param stream A reader stream on the ham message to analyze
    * @throws IOException If any error occurs
    */
   public void addHam(Reader stream)
   throws java.io.IOException {
       addTokenOccurrences(stream, hamTokenCounts);
       hamMessageCount++;
   }
   
   /**
    * Adds a message to the spam list.
    * @param stream A reader stream on the spam message to analyze
    * @throws IOException If any error occurs
    */
   public void addSpam(Reader stream)
   throws java.io.IOException {
       addTokenOccurrences(stream, spamTokenCounts);
       spamMessageCount++;
   }
   
   /**
    * Computes the probability that the stream contains SPAM.
    * @param stream The text to be analyzed for Spamminess.
    * @return A 0.0 - 1.0 probability
    * @throws IOException If any error occurs
    */
   public double computeSpamProbability(Reader stream)
   throws java.io.IOException {
       //Build a set of the tokens in the Stream.
       Set tokens = parse(stream);
       
       // Get the corpus to use in this run
       // A new corpus may be being built in the meantime
       Map workCorpus = getCorpus();
       
       //Assign their probabilities from the Corpus (using an additional
       //calculation to determine spamminess).
       SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
       
       //Compute and return the overall probability that the
       //stream is SPAM.
       return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
   }
   
   /**
    * Parses a stream into tokens, and updates the target Map
    * with the token/counts.
    *
    * @param stream
    * @param target
    */
   private void addTokenOccurrences(Reader stream, Map target)
   throws java.io.IOException {
       String token;
       String header = "";
       
       //Update target with the tokens/count encountered.
       while ((token = nextToken(stream)) != null) {
           boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
               endingLine = true;
               token = token.substring(0, token.length() - 1);
           }
           
           if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
               if (token.equals("From:")
               || token.equals("Return-Path:")
               || token.equals("Subject:")
               || token.equals("To:")
               ) {
                   header = token;
                   if (!endingLine) {
                       continue;
                   }
               }
               
               token = header + token;
               
               Integer value = null;
               
               if (target.containsKey(token)) {
                   value = new Integer(((Integer) target.get(token)).intValue() + 1);
               } else {
                   value = new Integer(1);
               }
               
               target.put(token, value);
           }
           
           if (endingLine) {
               header = "";
           }
       }
   }
   
   /**
    * Parses a stream into tokens, and returns a Set of
    * the unique tokens encountered.
    *
    * @param stream
    * @return Set
    */
   private Set parse(Reader stream)
   throws java.io.IOException {
       Set tokens = new HashSet();
       String token;
       String header = "";
       
       //Build a Map of tokens encountered.
       while ((token = nextToken(stream)) != null) {
           boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
               endingLine = true;
               token = token.substring(0, token.length() - 1);
           }
           
           if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
               if (token.equals("From:")
               || token.equals("Return-Path:")
               || token.equals("Subject:")
               || token.equals("To:")
               ) {
                   header = token;
                   if (!endingLine) {
                       continue;
                   }
               }
               
               token = header + token;
               
               tokens.add(token);
           }
           
           if (endingLine) {
               header = "";
           }
       }
       
       //Return the unique set of tokens encountered.
       return tokens;
   }
   
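    /**
     * Reads the next token from the stream.
     *
     * Tokens are built from letters, digits and a few significant
     * characters ('-', '$', the Euro symbol, '!', '\''); a '.' or ','
     * inside a run of digits is kept (e.g. amounts). The header prefixes
     * "From:", "Return-Path:", "Subject:" and "To:" are returned as
     * tokens of their own, and a trailing '\n' marks the end of a line.
     *
     * @param reader The stream to tokenize.
     * @return The next token, or null if no further token is found.
     */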
   private String nextToken(Reader reader) throws java.io.IOException {
       StringBuffer token = new StringBuffer();
       int i;
       char ch, ch2;
       boolean previousWasDigit = false;
       boolean tokenCharFound = false;
       
       if (!reader.ready()) {
           return null;
       }
       
       while ((i = reader.read()) != -1) {
           
           ch = (char) i;
           
           if (ch == ":") {
               String tokenString = token.toString() + ":";
               if (tokenString.equals("From:")
               || tokenString.equals("Return-Path:")
               || tokenString.equals("Subject:")
               || tokenString.equals("To:")
               ) {
                   return tokenString;
               }
           }
           
            if (Character.isLetter(ch)
            || ch == '-'
            || ch == '$'
            || ch == '\u20AC' // the EURO symbol
            || ch == '!'
            || ch == '\''
            ) {
               tokenCharFound = true;
               previousWasDigit = false;
               token.append(ch);
           } else if (Character.isDigit(ch)) {
               tokenCharFound = true;
               previousWasDigit = true;
               token.append(ch);
            } else if (previousWasDigit && (ch == '.' || ch == ',')) {
               reader.mark(1);
               previousWasDigit = false;
               i = reader.read();
               if (i == -1) {
                   break;
               }
               ch2 = (char) i;
               if (Character.isDigit(ch2)) {
                   tokenCharFound = true;
                   previousWasDigit = true;
                   token.append(ch);
                   token.append(ch2);
               } else {
                   reader.reset();
                   break;
               }
           } else if (ch == "\r") {
               // cr found, ignore
           } else if (ch == "\n") {
               // eol found
               tokenCharFound = true;
               previousWasDigit = false;
               token.append(ch);
               break;
           } else if (tokenCharFound) {
               break;
           }
       }
       
       if (tokenCharFound) {
           //          System.out.println("Token read: " + token);
           return token.toString();
       } else {
           return null;
       }
   }
   
   /**
    * Compute the probability that "token" is SPAM.
    *
    * @param token
    * @return  The probability that the token occurs within spam.
    */
   private double computeProbability(String token) {
       double hamFactor  = 0;
       double spamFactor = 0;
       
       boolean foundInHam = false;
       boolean foundInSpam = false;
       
       double minThreshold = 0.01;
       double maxThreshold = 0.99;
       
       if (hamTokenCounts.containsKey(token)) {
           foundInHam = true;
       }
       
       if (spamTokenCounts.containsKey(token)) {
           foundInSpam = true;
       }
       
       if (foundInHam) {
           hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
           if (!foundInSpam) {
               minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
           }
       }
       
       if (foundInSpam) {
           spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
           if (!foundInHam) {
               maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
           }
       }
       
       if ((hamFactor + spamFactor) < 5) {
           //This token hasn"t been seen enough.
           return 0.4;
       }
       
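        // Graham-style estimate: the ham count was doubled above to bias the
        // result toward ham (fewer false positives); the token probability is
        // the spam frequency over the combined frequency, clamped to
        // [minThreshold, maxThreshold]. For example, a token seen in 4 of 100
        // spam messages and 1 of 200 ham messages scores roughly
        // 0.04 / (0.04 + 0.01) = 0.8.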
       double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
       double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
       
       return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
   }
   
   /**
    * Returns a SortedSet of TokenProbabilityStrength built from the
    * Corpus and the tokens passed in the "tokens" Set.
    * The ordering is from the highest strength to the lowest strength.
    *
    * @param tokens
    * @param workCorpus
    * @return  SortedSet of TokenProbabilityStrength objects.
    */
   private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
       //Convert to a SortedSet of token probability strengths.
       SortedSet tokenProbabilityStrengths = new TreeSet();
       
       Iterator i = tokens.iterator();
       while (i.hasNext()) {
           TokenProbabilityStrength tps = new TokenProbabilityStrength();
           
           tps.token = (String) i.next();
           
           if (workCorpus.containsKey(tps.token)) {
               tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
           }
           else {
               //This token has never been seen before,
               //we"ll give it initially the default probability.
               Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
               tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
               boolean isTokenDegeneratedFound = false;
               
               Collection degeneratedTokens = buildDegenerated(tps.token);
               Iterator iDegenerated = degeneratedTokens.iterator();
               String tokenDegenerated = null;
               double strengthDegenerated;
               while (iDegenerated.hasNext()) {
                   tokenDegenerated = (String) iDegenerated.next();
                   if (workCorpus.containsKey(tokenDegenerated)) {
                       Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
                       strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
                       if (strengthDegenerated > tps.strength) {
                           isTokenDegeneratedFound = true;
                           tps.strength = strengthDegenerated;
                           corpusProbability = probabilityTemp;
                       }
                   }
               }
               // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
               if (isTokenDegeneratedFound) {
                   synchronized(workCorpus) {
                       workCorpus.put(tps.token, corpusProbability);
                   }
               }
           }
           
           tokenProbabilityStrengths.add(tps);
       }
       
       return tokenProbabilityStrengths;
   }
   
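    /**
     * Builds a Collection of "degenerated" variants of a token: the fully
     * lower-cased form, the form with only the first letter left upper-case,
     * and forms with trailing '!' characters stripped, each added both with
     * and without its header prefix. These variants are looked up in the
     * corpus when the exact token has not been seen before.
     */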
   private Collection buildDegenerated(String fullToken) {
       ArrayList tokens = new ArrayList();
       String header;
       String token;
       
       // look for a header string termination
       int headerEnd = fullToken.indexOf(":");
       if (headerEnd >= 0) {
           header = fullToken.substring(0, headerEnd);
           token = fullToken.substring(headerEnd);
       } else {
           header = "";
           token = fullToken;
       }
       
       int end = token.length();
       do {
           if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) {
               tokens.add(header + token.substring(0, end).toLowerCase());
               if (header.length() > 0) {
                   tokens.add(token.substring(0, end).toLowerCase());
               }
           }
            if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
               tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase());
               if (header.length() > 0) {
                   tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase());
               }
           }
           
            if (token.charAt(end - 1) != '!') {
               break;
           }
           
           end--;
           
           tokens.add(header + token.substring(0, end));
           if (header.length() > 0) {
               tokens.add(token.substring(0, end));
           }
       } while (end > 0);
       
       return tokens;
   }
   
   /**
    * Compute the spamminess probability of the interesting tokens in
    * the tokenProbabilities SortedSet.
    *
    * @param tokenProbabilities
    * @param workCorpus
    * @return  Computed spamminess.
    */
   private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
       double p = 1.0;
       double np = 1.0;
       double tempStrength = 0.5;
       int count = MAX_INTERESTING_TOKENS;
       Iterator iterator = tokenProbabilityStrengths.iterator();
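        // Combine the strongest tokens: up to MAX_INTERESTING_TOKENS of them,
        // plus any further ones while the strength stays at or above
        // INTERESTINGNESS_THRESHOLD, using the naive Bayes combination
        // p1*...*pn / (p1*...*pn + (1-p1)*...*(1-pn)) accumulated in p and np.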
       while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
           TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
           tempStrength = tps.strength;
           
           //      System.out.println(tps);
           
           double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
           Double theDoubleObject = (Double) workCorpus.get(tps.token);
           // if either the original token or a degeneration was found use the double value, otherwise use the default
           if (theDoubleObject != null) {
               theDoubleValue = theDoubleObject.doubleValue();
           }
           p *= theDoubleValue;
           np *= (1.0 - theDoubleValue);
           // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
       }
       
       return (p / (p + np));
   }
   
   private boolean allSameChar(String s) {
       if (s.length() < 2) {
           return false;
       }
       
       char c = s.charAt(0);
       
       for (int i = 1; i < s.length(); i++) {
           if (s.charAt(i) != c) {
               return false;
           }
       }
       return true;
   }
   
   private boolean allDigits(String s) {
       for (int i = 0; i < s.length(); i++) {
           if (!Character.isDigit(s.charAt(i))) {
               return false;
           }
       }
       return true;
   }

}


 </source>
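
Below is a minimal usage sketch illustrating the flow described in the class comment: train with addHam/addSpam (one message per call), call buildCorpus, then score a text with computeSpamProbability. The demo class name and the sample texts are made up for illustration; only BayesianAnalyzer itself comes from the listing above.

<source lang="java">
import java.io.StringReader;

public class BayesianAnalyzerDemo {
    public static void main(String[] args) throws java.io.IOException {
        BayesianAnalyzer analyzer = new BayesianAnalyzer();

        // Train one message per call (hypothetical sample texts).
        analyzer.addHam(new StringReader("Subject: meeting notes\nSee you at the review tomorrow.\n"));
        analyzer.addHam(new StringReader("Subject: lunch\nAre we still on for noon?\n"));
        analyzer.addSpam(new StringReader("Subject: WIN CASH NOW!!!\nClick here for a FREE prize!\n"));
        analyzer.addSpam(new StringReader("Subject: cheap offer\nBuy now, limited time only!!!\n"));

        // Build the token/probability corpus from the counts gathered above.
        analyzer.buildCorpus();

        // Score a new message; a result of 0.9 or above indicates spam.
        double probability = analyzer.computeSpamProbability(
                new StringReader("Subject: FREE prize\nClick here to WIN!!!\n"));
        System.out.println("Spam probability: " + probability);
    }
}
</source>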