Commit e41e070f by Hut

many much

parent 6763731a
...@@ -19,27 +19,28 @@ public class Builder { ...@@ -19,27 +19,28 @@ public class Builder {
this.data = data; this.data = data;
} }
public Collection<Map.Entry<Prefix, Decission>> random() { public Sentence random() {
return produce(p -> nextRandom(p)); return produce(p -> nextRandom(p));
} }
public Collection<Map.Entry<Prefix, Decission>> average() { public Sentence average() {
return produce(p -> data.fetch(p).average()); return produce(p -> data.fetch(p).average());
} }
private Collection<Decission> produce( private Sentence produce(
Function<Prefix, Decission> producer) { Function<Prefix, Decission> producer) {
List<Decission> result = new LinkedList<>(); Sentence sentence = new Sentence(data);
Prefix p = initPrefix(); Prefix p = initPrefix();
Decission d = new Decission(Token.START, 0, p, null); Decission d = new Decission(Token.START, 0);
for (int i = 0; i < 1000; i++) { for (int i = 0; i < 1000; i++) {
if (d.getToken() == Token.END) if (Token.END.equals(d.getToken()))
break; break;
d = producer.apply(p); d = producer.apply(p);
result.add(new AbstractMap.SimpleEntry<>(p, d)); d.setP(p);
sentence.add(d);
p = p.slide(d.getToken(), prefix_length); p = p.slide(d.getToken(), prefix_length);
} }
return result; return sentence;
} }
private Prefix initPrefix() { private Prefix initPrefix() {
...@@ -51,7 +52,9 @@ public class Builder { ...@@ -51,7 +52,9 @@ public class Builder {
if (l == null) { if (l == null) {
throw new NullPointerException(String.format("could not find a lookup for %s", prefix)); throw new NullPointerException(String.format("could not find a lookup for %s", prefix));
} }
return l.forRandom(nextRandomNumber()); Decission decission = l.forRandom(nextRandomNumber());
decission.setLookup(l);
return decission;
} }
private double nextRandomNumber() { private double nextRandomNumber() {
......
package markov; package markov;
import java.io.Serializable;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
// FIXME rename // FIXME rename
public class Data { // TODO add builder and move add()
public class Data implements Serializable{
private final int prefixLength; private final int prefixLength;
private final Map<Prefix, Lookup> data = new HashMap<>(); private final Map<Prefix, Lookup> data = new HashMap<>();
......
package markov; package markov;
public class Glyph { import java.io.Serializable;
public class Glyph implements Serializable {
public enum Type { public enum Type {
word, punctuation, control, whitespace, empty, other word, punctuation, control, whitespace, empty, other
} }
...@@ -31,8 +33,6 @@ public class Glyph { ...@@ -31,8 +33,6 @@ public class Glyph {
if (type != glyph.type) { if (type != glyph.type) {
return false; return false;
}else if(type == Type.control){
return this == o;
} }
return content != null ? content.equals(glyph.content) : glyph.content == null; return content != null ? content.equals(glyph.content) : glyph.content == null;
} }
......
package markov; package markov;
import java.io.Serializable;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
public class Lookup { public class Lookup implements Serializable {
private final LinkedHashMap<Token, Integer> tokens = new LinkedHashMap<Token, Integer>(); private final LinkedHashMap<Token, Integer> tokens = new LinkedHashMap<Token, Integer>();
...@@ -46,7 +47,7 @@ public class Lookup { ...@@ -46,7 +47,7 @@ public class Lookup {
public Decission forId(int id) { public Decission forId(int id) {
if (id < 0 || id > getTotalCounts()) { if (id < 0 || id > getTotalCounts()) {
throw new IllegalArgumentException("expected id [0; totalCount)"); throw new IllegalArgumentException(String.format("got %d but expected id [0; %d)",id, getTotalCounts()));
} }
int i = 0; int i = 0;
for (Entry<Token, Integer> entry : tokens.entrySet()) { for (Entry<Token, Integer> entry : tokens.entrySet()) {
......
package markov; package markov;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.ObjectInputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.util.Arrays; import java.util.Base64;
import java.util.Collection; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
public class Mail { public class Mail {
public static void main(String[] args) { public static void main(String[] args) throws IOException, ClassNotFoundException {
int prefixLength = 3; int prefixLength = 4;
System.out.println("reading map");
Map<Integer, Data> dataMap = (Map<Integer, Data>) new ObjectInputStream(Files.newInputStream(new File("C:\\Users\\admin\\git\\markovdisplay\\target\\classes\\data").toPath())).readObject();
System.out.println("read map");
Builder b = new Builder(prefixLength, dataMap.get(prefixLength));
Renderer r = new Renderer(Renderer.Options.FULL);
Renderer r1 = new Renderer(Renderer.Options.NONE);
Shortener shortener = new Shortener(dataMap.get(prefixLength));
CountMap<Integer> counter = new CountMap<>();
System.out.println("creating...");
final int max = 100_000;
for (int i = 0; i < 100_000; i++) {
if (i % (max / 10) == 0) {
System.out.println(i / (max / 100) + "%");
}
Sentence sentence = b.random();
String id = shortener.getId(sentence);
shortener.toInts(id).forEach(c -> counter.count(c));
}
System.out.println("distinct: " + counter.map.size());
System.out.println(counter.result().limit(100).map(e -> e.getValue().get() + "*" + e.getKey()).collect(Collectors.joining("\n")));
System.out.println("done");
}
private static class CountMap<T> {
static class MutableInt {
int value = 1;
Parser parser = new Parser(); public MutableInt(int value) {
File parent = new File("C:\\Users\\admin\\Desktop\\emails"); this.value = value;
Stream<String> mails = Arrays.stream(parent.listFiles()).limit(500).map(file -> file.toPath()).map(path -> { }
try {
MimeMessage m = new MimeMessage(Session.getDefaultInstance(new Properties()), Files.newInputStream(path)); public void increment() {
//System.out.println(m.getContentType()); ++value;
// return m.getContent().toString();
return m;
} catch (IOException | MessagingException e) {
throw new RuntimeException(e);
} }
}).filter(mimeMessage -> {
try { public int get() {
return mimeMessage.getContentType().contains("text/plain"); return value;
} catch (MessagingException e) {
throw new RuntimeException(e);
} }
}).map(m -> {
try {
return m.getContent().toString();
} catch (IOException | MessagingException e) {
throw new RuntimeException(e);
} }
});
Data data = Utils.parse(mails, prefixLength);
Builder b = new Builder(prefixLength, data); private final Map<T, MutableInt> map = new HashMap<>();
Renderer r = new Renderer(data, Renderer.Options.FULL);
Renderer r1 = new Renderer(data, Renderer.Options.NONE);
Shortener shortener = new Shortener(data);
for (int i = 0; i < 10; i++) { public void count(T t) {
Collection<Map.Entry<Prefix, Decission>> sentence = b.random(); MutableInt i = map.get(t);
// System.out.println(r1.render(sentence)); if (i == null) {
String id = shortener.getId(sentence); map.put(t, new MutableInt(1));
System.out.println(id); } else {
// System.out.println(r.render(shortener.getSentence(id))); i.increment();
}
}
public Stream<Map.Entry<T, MutableInt>> result() {
return map.entrySet().stream().sorted((e1, e2) -> Integer.compare(e2.getValue().get(), e1.getValue().get()));
} }
// Collection<Map.Entry<Prefix, Token>> a = b.average();
// System.out.println("average: " + r.render(a));
// for (Map.Entry<Prefix, Token> e : a) {
// System.out.println(e.getKey() + " -> " + data.fetch(e.getKey()));
// }
// System.out.println(data.dumpStats());
} }
} }
package markov; package markov;
import java.util.Collection;
import java.util.Map;
import java.util.stream.Stream; import java.util.stream.Stream;
public class Main { public class Main {
...@@ -13,9 +11,9 @@ public class Main { ...@@ -13,9 +11,9 @@ public class Main {
Data data = Utils.parse(Stream.of(input.replace("Kai☺UWE empfiehlt ", "").split("\n")), prefixLength); Data data = Utils.parse(Stream.of(input.replace("Kai☺UWE empfiehlt ", "").split("\n")), prefixLength);
Builder b = new Builder(prefixLength, data); Builder b = new Builder(prefixLength, data);
Renderer r = new Renderer(data, Renderer.Options.NONE); Renderer r = new Renderer(Renderer.Options.NONE);
Renderer r2 = new Renderer(data, Renderer.Options.FULL); Renderer r2 = new Renderer(Renderer.Options.FULL);
Collection<Map.Entry<Prefix, Decission>> sentence = b.random(); Sentence sentence = b.random();
for (int i = 0; i < 1; i++) { for (int i = 0; i < 1; i++) {
System.out.println(r2.render(sentence)); System.out.println(r2.render(sentence));
} }
......
...@@ -10,7 +10,7 @@ public class Parser { ...@@ -10,7 +10,7 @@ public class Parser {
public Map<Integer, Data> parse(Stream<Stream<Token>> input, Collection<Integer> prefixes) { public Map<Integer, Data> parse(Stream<Stream<Token>> input, Iterable<Integer> prefixes) {
Collection<Collector> collectors = new ArrayList<>(); Collection<Collector> collectors = new ArrayList<>();
for (Integer i : prefixes) { for (Integer i : prefixes) {
collectors.add(new Collector(i)); collectors.add(new Collector(i));
......
package markov; package markov;
import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
// FIXME rename // FIXME rename
public class Prefix { public class Prefix implements Serializable{
private final List<Token> tokens; private final List<Token> tokens;
public Prefix(Token[] tokens) { public Prefix(Token[] tokens) {
......
package markov; package markov;
import java.util.Collection;
import java.util.Map.Entry;
public class Renderer { public class Renderer {
public static class Options { public static class Options {
private final boolean propability; private final boolean propability;
...@@ -25,32 +22,25 @@ public class Renderer { ...@@ -25,32 +22,25 @@ public class Renderer {
} }
private final Data data;
private final Options options; private final Options options;
public Renderer(Data data) { public Renderer() {
super(); super();
this.data = data;
this.options = Options.NONE; this.options = Options.NONE;
} }
public Renderer(Data data, Options options) { public Renderer(Options options) {
super(); super();
this.data = data;
this.options = options; this.options = options;
} }
public String render(Collection<Entry<Prefix, Decission>> sentence) { public String render(Sentence sentence) {
double p = 1;
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
if (options.recomends) if (options.recomends)
sb.append("KAI-uwe empfiehlt"); sb.append("KAI-uwe empfiehlt");
for (Entry<Prefix, Decission> t : sentence) { for (Decission d : sentence) {
Lookup lookup = data.fetch(t.getKey()); Lookup lookup = d.getLookup();
int possibilities = lookup.getDistinctTokens(); int possibilities = lookup.getDistinctTokens();
p *= (double) lookup.getAmount(t.getValue().getToken())
/ (double) lookup.getTotalCounts();
if (options.possibilities) { if (options.possibilities) {
if (possibilities <= 1) { if (possibilities <= 1) {
...@@ -75,14 +65,16 @@ public class Renderer { ...@@ -75,14 +65,16 @@ public class Renderer {
sb.append(" "); sb.append(" ");
} }
} }
sb.append(t.getValue().getToken().render(options.prefix)); if ((Token.START .equals(d.getToken())|| Token.END.equals(d.getToken()) ) && !options.specialToken) {
continue;
}
sb.append(d.getToken().render(options.prefix));
} }
if (options.propability) { if (options.propability) {
sb.append(" - " + p); sb.append(" - " + sentence.propability());
} }
return sb.toString(); return sb.toString();
} }
} }
package markov;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.stream.Stream;
/**
 * An ordered sequence of {@link Decission}s together with the {@link Data}
 * instance passed in at construction time.
 *
 * <p>Decissions are kept in insertion order (backed by an {@link ArrayList})
 * and are exposed both through {@link #iterator()} and {@link #asStream()}.
 */
public class Sentence implements Iterable<Decission>, Serializable {

    /** Decissions in the order they were added. */
    private final Collection<Decission> decissions = new ArrayList<>();

    /** Data handed to the constructor; forwarded to {@link Shortener} by {@link #id()}. */
    private final Data data;

    /**
     * Creates an empty sentence.
     *
     * @param data the data instance this sentence is associated with
     */
    public Sentence(Data data) {
        this.data = data;
    }

    /**
     * @return the data instance given to the constructor
     */
    public Data getData() {
        return data;
    }

    /**
     * Appends a decission to the end of this sentence.
     *
     * @param decission the decission to append
     */
    public void add(Decission decission) {
        this.decissions.add(decission);
    }

    /**
     * @return a sequential stream over the decissions in insertion order
     */
    public Stream<Decission> asStream() {
        return decissions.stream();
    }

    /**
     * Computes the product, over all decissions, of
     * {@code lookup.getAmount(token) / lookup.getTotalCounts()} where
     * {@code lookup} is the decission's own {@code getLookup()}.
     *
     * <p>An empty sentence yields {@code 1.0} (the empty product) instead of
     * throwing {@link java.util.NoSuchElementException}.
     *
     * <p>Every decission must have a non-null lookup, otherwise a
     * {@link NullPointerException} is thrown.
     *
     * @return the product of the per-decission ratios, or {@code 1.0} if the
     *         sentence is empty
     */
    public double propability() {
        return asStream()
                .mapToDouble(d -> (double) d.getLookup().getAmount(d.getToken())
                        / (double) d.getLookup().getTotalCounts())
                // Identity 1.0 keeps the result unchanged for non-empty
                // sentences (1.0 * x == x) and defines the empty case.
                .reduce(1.0, (d1, d2) -> d1 * d2);
    }

    /**
     * Renders this sentence with a default-constructed {@link Renderer}.
     *
     * @return the rendered text
     */
    public String render() {
        return new Renderer().render(this);
    }

    /**
     * Computes the id of this sentence using a {@link Shortener} built from
     * this sentence's data.
     *
     * @return the id string produced by {@code Shortener.getId}
     */
    public String id() {
        return new Shortener(data).getId(this);
    }

    /**
     * @return an iterator over the decissions in insertion order
     */
    @Override
    public Iterator<Decission> iterator() {
        return decissions.iterator();
    }
}
package markov; package markov;
import java.nio.charset.Charset;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Base64; import java.util.Base64;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
...@@ -18,17 +12,19 @@ public class Shortener { ...@@ -18,17 +12,19 @@ public class Shortener {
this.data = data; this.data = data;
} }
public String getId(Collection<Map.Entry<Prefix, Decission>> sentence) { public String getId(Sentence sentence) {
return fromInts(sentence.stream() return fromInts(sentence.asStream()
.map(d -> d.getValue().getId())); .flatMap(d -> {
if (d.getLookup().getDistinctTokens() == 1) {
return Stream.empty();
} else
return Stream.of(d.getId());
}));
} }
String fromInts(Stream<Integer> ints) { String fromInts(Stream<Integer> ints) {
String cps = ints.map(i -> { String cps = ints.map(i -> new String(Character.toChars(i))).collect(Collectors.joining());
System.out.println(i);
return new String(Character.toChars(i));
}).collect(Collectors.joining());
return Base64.getEncoder().encodeToString(cps.getBytes()); return Base64.getEncoder().encodeToString(cps.getBytes());
} }
...@@ -36,15 +32,18 @@ public class Shortener { ...@@ -36,15 +32,18 @@ public class Shortener {
return new String(Base64.getDecoder().decode(hash.getBytes())).codePoints().boxed(); return new String(Base64.getDecoder().decode(hash.getBytes())).codePoints().boxed();
} }
public Collection<Map.Entry<Prefix, Decission>> getSentence(String hash) { public Sentence getSentence(String hash) {
Integer[] ids = toInts(hash).toArray(Integer[]::new); Integer[] ids = toInts(hash).toArray(Integer[]::new);
Collection<Map.Entry<Prefix, Decission>> sentence = new ArrayList<>(); Sentence sentence = new Sentence(data);
Prefix prefix = new Prefix(new Token[0]); Prefix prefix = new Prefix(new Token[0]);
for (int id : ids) { int i = 0;
while (i < ids.length) {
Lookup lookup = data.fetch(prefix); Lookup lookup = data.fetch(prefix);
Decission d = lookup.forId(id); Decission d = lookup.getDistinctTokens() == 1 ? lookup.average() : lookup.forId(ids[i++]);
sentence.add(new AbstractMap.SimpleEntry<>(prefix, d)); d.setP(prefix);
d.setLookup(lookup);
sentence.add(d);
prefix = prefix.slide(d.getToken(), data.getPrefixLength()); prefix = prefix.slide(d.getToken(), data.getPrefixLength());
} }
return sentence; return sentence;
......
package markov; package markov;
public class Token { import java.io.Serializable;
public static final Token START = new SpecialToken(); public class Token implements Serializable {
public static final Token END = new SpecialToken();
protected static final Token EMPTY = new SpecialToken(); public static final Token START = new SpecialToken("START");
public static final Token END = new SpecialToken("END");
protected static final Token EMPTY = new SpecialToken("EMPTY");
public Token(String content) { public Token(String content) {
this(content, Glyph.Type.word); this(content, Glyph.Type.word);
} }
public Token(String content, Glyph.Type type) { public Token(String content, Glyph.Type type) {
super(); super();
this.content = content; this.content = content;
...@@ -19,14 +22,15 @@ public class Token { ...@@ -19,14 +22,15 @@ public class Token {
@Override @Override
public String toString() { public String toString() {
if (this == END) { if (this.equals(END)) {
return "Token <END>"; return "Token <END>";
} }
if (this == START) { if (this.equals(START)) {
return "TOKEN <Start>"; return "TOKEN <Start>";
} }
if (this == EMPTY) if (this.equals(EMPTY)) {
return "TOKEN <empty>"; return "TOKEN <empty>";
}
return "T=[" + content + "]"; return "T=[" + content + "]";
} }
...@@ -54,8 +58,8 @@ public class Token { ...@@ -54,8 +58,8 @@ public class Token {
private static class SpecialToken extends Token { private static class SpecialToken extends Token {
public SpecialToken() { public SpecialToken(String s) {
super("", Glyph.Type.control); super(s, Glyph.Type.control);
} }
@Override @Override
......
package markov; package markov;
import java.nio.ByteBuffer;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.stream.Stream; import java.util.stream.Stream;
public class Utils { public class Utils {
/**
 * Encodes an {@code int} as a 4-byte array using {@link ByteBuffer}'s
 * default (big-endian) byte order.
 *
 * @param value the integer to encode
 * @return a new 4-element array holding the encoded value
 */
public static byte[] toByteArray(int value) {
    byte[] encoded = new byte[4];
    // The wrapping buffer writes straight through to the backing array.
    ByteBuffer.wrap(encoded).putInt(value);
    return encoded;
}
/**
 * Decodes an {@code int} from the first four bytes of the given array using
 * {@link ByteBuffer}'s default (big-endian) byte order.
 *
 * @param bytes the array to read from; bytes beyond the fourth are ignored
 * @return the decoded integer
 * @throws java.nio.BufferUnderflowException if the array has fewer than four bytes
 */
public static int fromByteArray(byte[] bytes) {
    ByteBuffer source = ByteBuffer.wrap(bytes);
    return source.getInt();
}
public static Data parse(Stream<String> input, int prefixLength) { public static Data parse(Stream<String> input, int prefixLength) {
Collection<Integer> collectionDummy = new ArrayList<>(); Collection<Integer> collectionDummy = new ArrayList<>();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment