Commit e42c4630 by Hut

Merge branch 'master' of ssh://git.breab.org:2223/patrick/markov

# Conflicts: # .gitlab-ci.yml
parent c907c0b0
......@@ -6,7 +6,8 @@
<groupId>passive.directory</groupId>
<artifactId>markov</artifactId>
<version>0.2</version>
<version>0.4</version>
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
......@@ -16,6 +17,18 @@
</properties>
<dependencies>
<dependency>
<groupId>com.tomgibara.bits</groupId>
<artifactId>bits</artifactId>
<version>2.1.0</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>24.0-jre</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
......@@ -69,4 +82,37 @@
</plugins>
</build>
<profiles>
<profile>
<id>one</id>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<archive>
<manifest>
<mainClass>markov.Mail</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>
\ No newline at end of file
......@@ -9,9 +9,9 @@ import java.util.stream.Collectors;
public class Lookup implements Serializable {
private final LinkedHashMap<Token, Integer> tokens = new LinkedHashMap<Token, Integer>();
private int totalCounts =0;
int getTotalCounts() {
return tokens.values().stream().mapToInt(i -> i.intValue()).sum();
return totalCounts;
}
int getDistinctTokens() {
......@@ -25,6 +25,7 @@ public class Lookup implements Serializable {
public void add(Token t) {
Integer i = tokens.getOrDefault(t, 0);
tokens.put(t, i + 1);
totalCounts++;
}
public Decission forRandom(double random) {
......
package markov;
import markov.stuff.Utils;
import java.util.stream.Stream;
public class Main {
......@@ -17,7 +19,7 @@ public class Main {
for (int i = 0; i < 1; i++) {
System.out.println(r2.render(sentence));
}
Shortener shortener = new Shortener(data);
ShortenerByteImpl shortener = new ShortenerByteImpl(data);
String id = shortener.getId(sentence);
System.out.println(id);
System.out.println(r.render(shortener.getSentence(id)));
......
......@@ -3,74 +3,74 @@ package markov;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
// FIXME rename
public class Prefix implements Serializable{
private final List<Token> tokens;
public class Prefix implements Serializable, Comparable<Prefix> {
private final Token[] tokens;
public Prefix(Token[] tokens) {
this.tokens = compressLeadingStarts(new ArrayList<>(Arrays.asList(tokens)));
this.tokens = tokens;
this.hashCode = generateHashCode();
}
public Prefix(List<Token> tokens) {
this.tokens = compressLeadingStarts(new ArrayList<>(tokens));
this(tokens.toArray(new Token[tokens.size()]));
}
public Prefix slide(Token newToken) {
return slide(newToken, this.tokens.size());
return slide(newToken, this.tokens.length);
}
public Prefix slide(Token newToken, int maxLength) {
List<Token> newTokens = new ArrayList<>(this.tokens);
List<Token> newTokens = new ArrayList<Token>(Arrays.asList(this.tokens.clone()));
newTokens.add(newToken);
while(newTokens.size() > maxLength) {
while (newTokens.size() > maxLength) {
newTokens.remove(0);
}
return new Prefix(newTokens);
}
private List<Token> compressLeadingStarts(List<Token> tokens) {
return tokens;
// if (tokens.size() > 1 && tokens.get(0) == Token.START) {
// Iterator<Token> iterator = tokens.iterator();
// for (Token t = iterator.next(); iterator.hasNext(); t = iterator.next()) {
// if (t == Token.START) {
// iterator.remove();
// } else {
// break;
// }
// }
// }
// return tokens;
}
@Override
public String toString() {
return "P=" + tokens;
return "Prefix{" +
"tokens=" + Arrays.toString(tokens) +
'}';
}
private final int hashCode;
private int generateHashCode() {
return Arrays.hashCode(tokens);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + tokens.hashCode();
return result;
return hashCode;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Prefix other = (Prefix) obj;
if (!tokens.equals(other.tokens))
return false;
return true;
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Prefix prefix = (Prefix) o;
return Arrays.equals(tokens, prefix.tokens);
}
@Override
public int compareTo(Prefix o) {
if(this.tokens.length != o.tokens.length) {
return Integer.compare(o.tokens.length, this.tokens.length);
}
for (int i = 0; i < this.tokens.length; i++) {
int c = this.tokens[i].compareTo(o.tokens[i]);
if(c != 0) {
return c;
}
}
return 0;
}
}
......@@ -38,7 +38,7 @@ public class Sentence implements Iterable<Decission>, Serializable {
}
public String id() {
return new Shortener(data).getId(this);
return new ShortenerByteImpl(data).getId(this);
}
@Override
public Iterator<Decission> iterator() {
......
package markov;
import java.util.Base64;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public interface Shortener {
String getId(Sentence sentence);
public class Shortener {
private final Data data;
public Shortener(Data data) {
this.data = data;
}
public String getId(Sentence sentence) {
return fromInts(sentence.asStream()
.flatMap(d -> {
if (d.getLookup().getDistinctTokens() == 1) {
return Stream.empty();
} else
return Stream.of(d.getId());
}));
}
String fromInts(Stream<Integer> ints) {
String cps = ints.map(i -> new String(Character.toChars(i))).collect(Collectors.joining());
return Base64.getEncoder().encodeToString(cps.getBytes());
}
Stream<Integer> toInts(String hash) {
return new String(Base64.getDecoder().decode(hash.getBytes())).codePoints().boxed();
}
public Sentence getSentence(String hash) {
Integer[] ids = toInts(hash).toArray(Integer[]::new);
Sentence sentence = new Sentence(data);
Prefix prefix = new Prefix(new Token[0]);
int i = 0;
while (i < ids.length) {
Lookup lookup = data.fetch(prefix);
Decission d = lookup.getDistinctTokens() == 1 ? lookup.average() : lookup.forId(ids[i++]);
d.setP(prefix);
d.setLookup(lookup);
sentence.add(d);
prefix = prefix.slide(d.getToken(), data.getPrefixLength());
}
return sentence;
}
Sentence getSentence(String hash);
}
package markov;
import markov.stuff.Utils;
import java.nio.ByteBuffer;
import java.util.Base64;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class ShortenerByte2Impl extends ShortenerSimpleImpl {
public ShortenerByte2Impl(Data data) {
super(data);
}
String fromInts(Stream<Integer> ints) {
List<Byte> bl = ints.flatMap(i -> {
Stream.Builder<Byte> builder = Stream.<Byte>builder();
if (i < 0b00111111_11111111) {
byte[] bytes = Utils.toByteArray(i);
builder.add((byte) (bytes[2] | 0b01000000));
builder.add(bytes[3]);
} else {
for (byte b : Utils.toByteArray(i)) {
builder.add(b);
}
}
return builder.build();
}).collect(Collectors.toList());
ByteBuffer bb = ByteBuffer.allocate(bl.size());
for (Byte b : bl) {
bb.put(b);
}
bb.rewind();
return Base64.getEncoder().encodeToString(bb.array());
}
}
package markov;
import markov.stuff.Utils;
import java.nio.ByteBuffer;
import java.util.Base64;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class ShortenerByte64Impl extends ShortenerSimpleImpl {
public ShortenerByte64Impl(Data data) {
super(data);
}
String fromInts(Stream<Integer> ints) {
List<Byte> bl = ints.flatMap(i -> {
if (i < 64) {
return Stream.of((byte) (i | 0b01000000));
} else {
Stream.Builder<Byte> builder = Stream.<Byte>builder();
for (byte b : Utils.toByteArray(i)) {
builder.add(b);
}
return builder.build();
}
}).collect(Collectors.toList());
ByteBuffer bb = ByteBuffer.allocate(bl.size());
for (Byte b : bl) {
bb.put(b);
}
bb.rewind();
return Base64.getEncoder().encodeToString(bb.array());
}
}
package markov;
import markov.stuff.CountMap;
import markov.stuff.SimpleCountMap;
import markov.stuff.Utils;
import java.nio.ByteBuffer;
import java.util.Base64;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class ShortenerByteHUffmanTrainerImpl extends ShortenerSimpleImpl {
private final CountMap<Integer> intsMap = new SimpleCountMap<>();
private final CountMap<Byte> byteMap = new SimpleCountMap<>();
public CountMap<Integer> getIntsMap() {
return intsMap;
}
public CountMap<Byte> getByteMap() {
return byteMap;
}
public ShortenerByteHUffmanTrainerImpl(Data data) {
super(data);
String prefix = "init trainer for " + data.getPrefixLength() + ": ";
// for (int i = 0; i < Integer.MAX_VALUE; i++) {
// intsMap.count(i);
// Utils.maybePrintPercentages(prefix, i, Integer.MAX_VALUE);
// }
// for (int i = 0; i < Byte.MAX_VALUE; i++) {
// byteMap.count((byte) i);
// }
}
String fromInts(Stream<Integer> ints) {
List<Byte> bl = ints.flatMap(i -> {
intsMap.count(i);
if (i < 128) {
return Stream.of((byte) (i | 0b10000000));
} else {
Stream.Builder<Byte> builder = Stream.<Byte>builder();
for (byte b : Utils.toByteArray(i)) {
builder.add(b);
}
return builder.build();
}
}).collect(Collectors.toList());
ByteBuffer bb = ByteBuffer.allocate(bl.size());
for (Byte b : bl) {
bb.put(b);
byteMap.count(b);
}
bb.rewind();
return Base64.getEncoder().encodeToString(bb.array());
}
}
\ No newline at end of file
package markov;
import markov.huffman.HuffmanCode;
import markov.stuff.BitConverter;
import markov.stuff.SimpleCountMap;
import markov.stuff.Utils;
import java.util.Base64;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Shortener that first turns ids into bytes and then compresses those bytes with a Huffman
 * code over {@code Byte} symbols.
 *
 * <p>Byte layout before compression: ids below 128 become one byte with the high bit set,
 * every other id becomes the four bytes returned by {@code Utils.toByteArray}.
 * Decoding assumes ids are non-negative and that {@code Utils.toByteArray} emits the most
 * significant byte first, so that the first byte of a four-byte id has its high bit clear
 * — TODO confirm against {@code Utils}.
 */
public class ShortenerByteHuffmanImpl extends ShortenerSimpleImpl {

    /** Ids strictly below this bound are written as one byte. */
    private static final int SINGLE_BYTE_LIMIT = 128;

    /** High bit that marks a single-byte id. */
    private static final int SINGLE_BYTE_MARKER = 0b10000000;

    /** Mask selecting the id carried by a single-byte entry. */
    private static final int SINGLE_BYTE_PAYLOAD = 0b01111111;

    private final HuffmanCode<Byte, List<Boolean>, Boolean, SimpleCountMap.MutableInt> byteCode;

    /**
     * @param data     model handed to the superclass
     * @param byteCode Huffman code used to compress and expand the intermediate bytes
     */
    public ShortenerByteHuffmanImpl(
            Data data,
            HuffmanCode<Byte, List<Boolean>, Boolean, SimpleCountMap.MutableInt> byteCode) {
        super(data);
        this.byteCode = byteCode;
    }

    /**
     * Encodes the ids to bytes, Huffman-codes the bytes and packs the resulting bits.
     *
     * @param ints ids to encode
     * @return Base64 (standard alphabet) text of the packed bits
     */
    @Override
    String fromInts(Stream<Integer> ints) {
        List<Boolean> bl = ints.flatMap(i -> {
            if (i < SINGLE_BYTE_LIMIT) {
                return Stream.of((byte) (i | SINGLE_BYTE_MARKER));
            }
            Stream.Builder<Byte> builder = Stream.builder();
            for (byte b : Utils.toByteArray(i)) {
                builder.add(b);
            }
            return builder.build();
        }).flatMap(b -> byteCode.encode(b).stream()).collect(Collectors.toList());
        BitConverter bc = new BitConverter();
        return Base64.getEncoder().encodeToString(bc.toBytes(bl));
    }

    /**
     * Reverses {@link #fromInts(Stream)}.
     *
     * @param hash Base64 text produced by {@link #fromInts(Stream)}
     * @return the decoded ids in their original order
     */
    @Override
    Stream<Integer> toInts(String hash) {
        Stream.Builder<Integer> builder = Stream.builder();
        List<Byte> decoded = byteCode.decode(
                new BitConverter().toBits(Base64.getDecoder().decode(hash)));
        Iterator<Byte> iter = decoded.iterator();
        while (iter.hasNext()) {
            byte cur = iter.next();
            // A Java byte is signed (-128..127), so comparing it with 128 can never tell the
            // two forms apart; test the marker bit instead.
            if ((cur & SINGLE_BYTE_MARKER) != 0) {
                builder.add(cur & SINGLE_BYTE_PAYLOAD);
            } else {
                byte[] tmp = new byte[4];
                tmp[0] = cur;
                tmp[1] = iter.next();
                tmp[2] = iter.next();
                tmp[3] = iter.next();
                builder.add(Utils.fromByteArray(tmp));
            }
        }
        return builder.build();
    }
}
\ No newline at end of file
package markov;
import markov.stuff.Utils;
import java.nio.ByteBuffer;
import java.util.Base64;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Shortener that writes ids below 128 as a single byte with the high bit set and every other
 * id as the four bytes returned by {@code Utils.toByteArray}; the bytes are then rendered
 * with the URL-safe Base64 alphabet.
 *
 * <p>Decoding assumes ids are non-negative and that {@code Utils.toByteArray} emits the most
 * significant byte first, so that the first byte of a four-byte id has its high bit clear
 * — TODO confirm against {@code Utils}.
 */
public class ShortenerByteImpl extends ShortenerSimpleImpl {

    /** Ids strictly below this bound are written as one byte. */
    private static final int SINGLE_BYTE_LIMIT = 128;

    /** High bit that marks a single-byte id. */
    private static final int SINGLE_BYTE_MARKER = 0b10000000;

    /** Mask selecting the id carried by a single-byte entry. */
    private static final int SINGLE_BYTE_PAYLOAD = 0b01111111;

    public ShortenerByteImpl(Data data) {
        super(data);
    }

    /**
     * Encodes the ids in stream order.
     *
     * @param ints ids to encode
     * @return URL-safe Base64 text of the encoded bytes
     */
    @Override
    String fromInts(Stream<Integer> ints) {
        List<Byte> bl = ints.flatMap(i -> {
            if (i < SINGLE_BYTE_LIMIT) {
                return Stream.of((byte) (i | SINGLE_BYTE_MARKER));
            }
            Stream.Builder<Byte> builder = Stream.builder();
            for (byte b : Utils.toByteArray(i)) {
                builder.add(b);
            }
            return builder.build();
        }).collect(Collectors.toList());
        ByteBuffer bb = ByteBuffer.allocate(bl.size());
        for (Byte b : bl) {
            bb.put(b);
        }
        return Base64.getUrlEncoder().encodeToString(bb.array());
    }

    /**
     * Reverses {@link #fromInts(Stream)}.
     *
     * @param hash URL-safe Base64 text produced by {@link #fromInts(Stream)}
     * @return the decoded ids in their original order
     */
    @Override
    Stream<Integer> toInts(String hash) {
        byte[] hashBytes = Base64.getUrlDecoder().decode(hash);
        Stream.Builder<Integer> builder = Stream.builder();
        for (int i = 0; i < hashBytes.length; ) {
            // A Java byte is signed (-128..127), so comparing it with 128 can never tell the
            // two forms apart; test the marker bit instead.
            if ((hashBytes[i] & SINGLE_BYTE_MARKER) != 0) {
                builder.add(hashBytes[i++] & SINGLE_BYTE_PAYLOAD);
            } else {
                byte[] newBytes = new byte[4];
                System.arraycopy(hashBytes, i, newBytes, 0, 4);
                builder.add(Utils.fromByteArray(newBytes));
                i += 4;
            }
        }
        return builder.build();
    }
}
package markov;
import markov.huffman.HuffmanCode;
import markov.stuff.BitConverter;
import markov.stuff.SimpleCountMap;
import java.util.Base64;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Shortener that Huffman-codes the ids directly (one symbol per id), packs the resulting bits
 * into bytes and renders them with the URL-safe Base64 alphabet.
 */
public class ShortenerIntHuffmanImpl extends ShortenerSimpleImpl {

    private final HuffmanCode<Integer, List<Boolean>, Boolean, SimpleCountMap.MutableInt> intCode;

    /**
     * @param data     model handed to the superclass
     * @param byteCode Huffman code over ids used for both directions
     */
    public ShortenerIntHuffmanImpl(
            Data data,
            HuffmanCode<Integer, List<Boolean>, Boolean, SimpleCountMap.MutableInt> byteCode) {
        super(data);
        intCode = byteCode;
    }

    /**
     * @param ints ids to encode
     * @return URL-safe Base64 text of the packed code bits
     */
    @Override
    String fromInts(Stream<Integer> ints) {
        Stream<Boolean> bitStream = ints.flatMap(id -> intCode.encode(id).stream());
        List<Boolean> bits = bitStream.collect(Collectors.toList());
        byte[] packed = new BitConverter().toBytes(bits);
        return Base64.getUrlEncoder().encodeToString(packed);
    }

    /**
     * @param hash URL-safe Base64 text produced by {@link #fromInts(Stream)}
     * @return the decoded ids in their original order
     */
    @Override
    Stream<Integer> toInts(String hash) {
        byte[] packed = Base64.getUrlDecoder().decode(hash);
        List<Boolean> bits = new BitConverter().toBits(packed);
        List<Integer> ids = intCode.decode(bits);
        return ids.stream();
    }
}
\ No newline at end of file
package markov;
import java.nio.ByteBuffer;
import java.util.Base64;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Baseline {@link Shortener}: collects the ids of all non-trivial decisions of a sentence and
 * turns them into text, and rebuilds a sentence from such text.
 *
 * <p>Subclasses customise the text format by overriding {@link #fromInts(Stream)} and
 * {@link #toInts(String)}.
 * NOTE(review): in this class {@code fromInts} writes each id as a four-byte int while
 * {@code toInts} reads the bytes back as the code points of a string; the two do not look
 * like inverses of each other — confirm before relying on a round trip through this class.
 */
public class ShortenerSimpleImpl implements Shortener {

    private final Data data;

    public ShortenerSimpleImpl(Data data) {
        this.data = data;
    }

    /**
     * Builds the id of a sentence from the ids of its decisions. Decisions whose lookup has
     * exactly one distinct token contribute nothing.
     */
    @Override
    public String getId(Sentence sentence) {
        Stream<Integer> ids = sentence.asStream()
                .filter(decision -> decision.getLookup().getDistinctTokens() != 1)
                .map(decision -> decision.getId());
        return fromInts(ids);
    }

    /**
     * Writes every id as a four-byte int and returns the bytes as Base64 (standard alphabet).
     */
    String fromInts(Stream<Integer> ints) {
        int[] values = ints.mapToInt(Integer::intValue).toArray();
        ByteBuffer packed = ByteBuffer.allocate(values.length * 4);
        for (int value : values) {
            packed.putInt(value);
        }
        return Base64.getEncoder().encodeToString(packed.array());
    }

    /**
     * Base64-decodes the text, interprets the bytes as a string in the platform default
     * charset and returns that string's code points.
     */
    Stream<Integer> toInts(String hash) {
        byte[] raw = Base64.getDecoder().decode(hash.getBytes());
        String text = new String(raw);
        return text.codePoints().boxed();
    }

    /**
     * Rebuilds a sentence by walking the model from the empty prefix. Whenever the current
     * lookup offers exactly one distinct token its {@code average()} decision is taken without
     * consuming an id; otherwise the next id selects the decision. The walk stops as soon as
     * all ids have been consumed.
     */
    @Override
    public Sentence getSentence(String hash) {
        Integer[] ids = toInts(hash).toArray(Integer[]::new);
        Sentence result = new Sentence(getData());
        Prefix current = new Prefix(new Token[0]);
        for (int next = 0; next < ids.length; ) {
            Lookup lookup = getData().fetch(current);
            Decission decision;
            if (lookup.getDistinctTokens() == 1) {
                decision = lookup.average();
            } else {
                decision = lookup.forId(ids[next++]);
            }
            decision.setP(current);
            decision.setLookup(lookup);
            result.add(decision);
            current = current.slide(decision.getToken(), getData().getPrefixLength());
        }
        return result;
    }

    protected Data getData() {
        return data;
    }
}
package markov;
import java.util.Base64;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class ShortenerStringImpl extends ShortenerSimpleImpl {
public ShortenerStringImpl(Data data) {
super(data);
}
String fromInts(Stream<Integer> ints) {
String cps = ints.map(i -> new String(Character.toChars(i))).collect(Collectors.joining());
return Base64.getEncoder().encodeToString(cps.getBytes());
}
}
......@@ -2,7 +2,7 @@ package markov;
import java.io.Serializable;
public class Token implements Serializable {
public class Token implements Serializable, Comparable<Token> {
public static final Token START = new SpecialToken("START");
public static final Token END = new SpecialToken("END");
......@@ -15,6 +15,7 @@ public class Token implements Serializable {
public Token(String content, Glyph.Type type) {
super();
this.content = content;
this.hashCode = generateHashCode();
}
private String content;
......@@ -45,17 +46,28 @@ public class Token implements Serializable {
return type == token.type;
}
@Override
public int hashCode() {
private final int hashCode;
private int generateHashCode() {
int result = content != null ? content.hashCode() : 0;
result = 31 * result + (type != null ? type.hashCode() : 0);
return result;
}
@Override
public int hashCode() {
return hashCode;
}
public String render(String prefix) {
return prefix + content;
}
@Override
public int compareTo(Token o) {
return o.content.compareTo(this.content);
}
private static class SpecialToken extends Token {
public SpecialToken(String s) {
......
package markov;
import java.util.ArrayList;
import java.util.Collection;
import java.util.stream.Stream;
/** Convenience entry points for building a model from raw text. */
public class Utils {

    /**
     * Tokenizes the input and parses it for a single prefix length.
     *
     * @param input        text to learn from
     * @param prefixLength the only prefix length to build a model for
     * @return the first value of the parser's result
     */
    public static Data parse(Stream<String> input, int prefixLength) {
        // The parser takes a collection of prefix lengths; wrap the single requested one.
        Collection<Integer> prefixLengths = new ArrayList<>();
        prefixLengths.add(prefixLength);

        Parser parser = new Parser();
        return parser.parse(new Tokenizer().tokenize(input), prefixLengths)
                .values()
                .stream()
                .findFirst()
                .get();
    }
}
package markov.huffman;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Supplier;
/**
 * {@link HuffmanCodeBuilder} preset whose codes are lists of booleans: {@code false} selects
 * the left child, {@code true} the right child, and the root carries the empty list.
 */
public class ByteHuffmanCodeBuilder<ContentType, FrequencyType extends FrequenceType<FrequencyType>> extends HuffmanCodeBuilder<ContentType, List<Boolean>, Boolean, FrequencyType> {

    /** Code of the root: a fresh empty list. */
    private static final Supplier<List<Boolean>> rootCodeSupplier = ArrayList::new;

    /** Glyph appended when descending to the left child. */
    private static final Supplier<Boolean> leftGlyph = () -> Boolean.FALSE;

    /** Glyph appended when descending to the right child. */
    private static final Supplier<Boolean> rightGlyph = () -> Boolean.TRUE;

    /** Returns a copy of the parent code with one more glyph; the parent is left untouched. */
    private static final BiFunction<List<Boolean>, Boolean, List<Boolean>> combiner = (parent, glyph) -> {
        List<Boolean> extended = new ArrayList<>(parent);
        extended.add(glyph);
        return extended;
    };

    /** Splits a code into its glyphs. */
    private static final Function<List<Boolean>, Iterator<Boolean>> splitter = List::iterator;

    public ByteHuffmanCodeBuilder() {
        super(rootCodeSupplier, leftGlyph, rightGlyph, combiner, splitter, (node, goRight) -> {
            if (goRight) {
                return node.getRight();
            }
            return node.getLeft();
        });
    }
}
package markov.huffman;
public interface FrequenceType<X extends FrequenceType> extends Comparable<X> {
X add(X b);
boolean isGreaterZero();
}
package markov.huffman;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import java.util.function.Function;
/**
 * A finished Huffman code: a lookup table for encoding single values and the code tree for
 * decoding sequences.
 *
 * @param <ContentType>   values being coded
 * @param <SequenceType>  representation of a code word or of a run of code words
 * @param <GlyphType>     single step of a sequence; used to pick a child while decoding
 * @param <FrequencyType> frequency type the tree was built with
 */
public class HuffmanCode<ContentType, SequenceType, GlyphType, FrequencyType extends FrequenceType<FrequencyType>> implements Serializable {
    private final Map<ContentType, SequenceType> codes;
    private final HuffmanTree<ContentType, SequenceType, FrequencyType> tree;
    private final Function<SequenceType, Iterator<GlyphType>> splitter;
    private final BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>,
            GlyphType, HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider;

    /**
     * @param codes            code word per value
     * @param tree             root of the code tree
     * @param splitter         turns a sequence into its glyphs
     * @param leftRightDecider picks the child of an inner node for a glyph
     */
    public HuffmanCode(Map<ContentType, SequenceType> codes,
                       HuffmanTree<ContentType, SequenceType, FrequencyType> tree,
                       Function<SequenceType, Iterator<GlyphType>> splitter,
                       BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>,
                               GlyphType,
                               HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider) {
        this.codes = codes;
        this.tree = tree;
        this.splitter = splitter;
        this.leftRightDecider = leftRightDecider;
    }

    /**
     * @param t value to encode
     * @return the code word stored for {@code t}, or {@code null} if the table has none
     */
    public SequenceType encode(ContentType t) {
        return codes.get(t);
    }

    /**
     * Walks the tree glyph by glyph, emitting a value each time a leaf is reached and then
     * restarting at the root. Glyphs left over after the last leaf are ignored.
     *
     * @param v sequence of concatenated code words
     * @return the decoded values in order
     */
    public List<ContentType> decode(SequenceType v) {
        final List<ContentType> decoded = new ArrayList<>();
        HuffmanTree<ContentType, SequenceType, FrequencyType> cursor = tree;
        for (Iterator<GlyphType> glyphs = splitter.apply(v); glyphs.hasNext(); ) {
            GlyphType step = glyphs.next();
            // The cursor is only ever the root or the child of an inner node that was not a leaf.
            HuffmanNode<ContentType, SequenceType, FrequencyType> inner =
                    (HuffmanNode<ContentType, SequenceType, FrequencyType>) cursor;
            cursor = leftRightDecider.apply(inner, step);
            if (cursor instanceof HuffmanLeaf) {
                HuffmanLeaf<ContentType, SequenceType, FrequencyType> leaf =
                        (HuffmanLeaf<ContentType, SequenceType, FrequencyType>) cursor;
                decoded.add(leaf.getValue());
                cursor = tree;
            }
        }
        return decoded;
    }
}
\ No newline at end of file
package markov.huffman;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Supplier;
/**
 * Builds {@link HuffmanCode} instances that code a given ContentType into a SequenceType,
 * and decode a sequence of SequenceTypes into a sequence of ContentTypes,
 * using the given FrequencyType to determine the distribution.
 *
 * <p>A builder holds only its configuration; every call to {@link #generateCode(Map)} produces
 * an independent tree and code table.
 *
 * @param <ContentType>   values being coded
 * @param <SequenceType>  representation of a code word
 * @param <GlyphType>     single step appended to a code word per tree level
 * @param <FrequencyType> frequency type used to order the symbols
 */
public class HuffmanCodeBuilder<ContentType, SequenceType, GlyphType, FrequencyType extends FrequenceType<FrequencyType>> {
    private final Supplier<SequenceType> rootCodeSupplier;
    private final Supplier<GlyphType> leftGlyph;
    private final Supplier<GlyphType> rightGlyph;
    private final BiFunction<SequenceType, GlyphType, SequenceType> combiner;
    private final Function<SequenceType, Iterator<GlyphType>> splitter;
    private final BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>,
            GlyphType, HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider;

    /**
     * @param rootCodeSupplier supplies the code assigned to the root
     * @param leftGlyph        supplies the glyph appended when descending left
     * @param rightGlyph       supplies the glyph appended when descending right
     * @param combiner         produces a child code from a parent code and a glyph
     * @param splitter         turns a sequence into its glyphs (handed to the generated code)
     * @param leftRightDecider picks a child for a glyph (handed to the generated code)
     */
    public HuffmanCodeBuilder(Supplier<SequenceType> rootCodeSupplier, Supplier<GlyphType> leftGlyph, Supplier<GlyphType> rightGlyph, BiFunction<SequenceType, GlyphType, SequenceType> combiner, Function<SequenceType, Iterator<GlyphType>> splitter, BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>, GlyphType, HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider) {
        this.rootCodeSupplier = rootCodeSupplier;
        this.leftGlyph = leftGlyph;
        this.rightGlyph = rightGlyph;
        this.combiner = combiner;
        this.splitter = splitter;
        this.leftRightDecider = leftRightDecider;
    }

    /**
     * Builds a Huffman code for the given frequencies. Entries whose frequency is not greater
     * than zero are ignored.
     *
     * @param frequencies frequency per value
     * @return a code with its own tree and code table
     * @throws IllegalArgumentException if fewer than two entries have a frequency greater
     *                                  than zero
     */
    public HuffmanCode<ContentType, SequenceType, GlyphType, FrequencyType> generateCode(Map<ContentType, FrequencyType> frequencies) {
        HuffmanNode<ContentType, SequenceType, FrequencyType> root = buildTree(frequencies);
        root.setCode(rootCodeSupplier.get());
        // A fresh table per call: codes returned earlier must not see symbols of later calls.
        Map<ContentType, SequenceType> codes = new HashMap<>();
        generateCodes(root, codes);
        return new HuffmanCode<>(codes, root, splitter, leftRightDecider);
    }

    /** Repeatedly merges the two least frequent trees until a single root remains. */
    private HuffmanNode<ContentType, SequenceType, FrequencyType> buildTree(Map<ContentType, FrequencyType> frequencies) {
        PriorityQueue<HuffmanTree<ContentType, SequenceType, FrequencyType>> trees = new PriorityQueue<>();
        for (Map.Entry<ContentType, FrequencyType> e : frequencies.entrySet()) {
            if (e.getValue().isGreaterZero()) {
                trees.offer(new HuffmanLeaf<ContentType, SequenceType, FrequencyType>(e.getValue(), e.getKey()));
            }
        }
        if (trees.size() < 2) {
            // With zero or one symbol there is no inner node to act as root.
            throw new IllegalArgumentException(
                    "at least two values with a frequency greater than zero are required, got "
                            + trees.size());
        }
        while (trees.size() > 1) {
            HuffmanTree<ContentType, SequenceType, FrequencyType> a = trees.poll();
            HuffmanTree<ContentType, SequenceType, FrequencyType> b = trees.poll();
            trees.offer(new HuffmanNode<ContentType, SequenceType, FrequencyType>(a, b));
        }
        // At least one merge happened, so the remaining element is an inner node.
        return (HuffmanNode<ContentType, SequenceType, FrequencyType>) trees.poll();
    }

    /** Assigns codes to all descendants of {@code child} and records leaf codes in {@code codes}. */
    private void generateCodes(HuffmanTree<ContentType, SequenceType, FrequencyType> child,
                               Map<ContentType, SequenceType> codes) {
        if (child == null) {
            throw new IllegalArgumentException(new NullPointerException());
        }
        if (child instanceof HuffmanLeaf) {
            HuffmanLeaf<ContentType, SequenceType, FrequencyType> leaf = (HuffmanLeaf<ContentType, SequenceType, FrequencyType>) child;
            codes.put(leaf.getValue(), leaf.getCode());
        } else if (child instanceof HuffmanNode) {
            HuffmanNode<ContentType, SequenceType, FrequencyType> node = (HuffmanNode<ContentType, SequenceType, FrequencyType>) child;
            // traverse left
            HuffmanTree<ContentType, SequenceType, FrequencyType> left = node.getLeft();
            left.setCode(combiner.apply(child.getCode(), leftGlyph.get()));
            generateCodes(left, codes);
            // traverse right
            HuffmanTree<ContentType, SequenceType, FrequencyType> right = node.getRight();
            right.setCode(combiner.apply(child.getCode(), rightGlyph.get()));
            generateCodes(right, codes);
        }
    }
}
package markov.huffman;
/** Leaf of a Huffman tree: a single value together with its frequency. */
class HuffmanLeaf<ContentType, SequenceType, FrequencyType extends FrequenceType<FrequencyType>>
        extends HuffmanTree<ContentType, SequenceType, FrequencyType> {

    /** The value this leaf stands for. */
    private final ContentType value;

    /**
     * @param freq frequency of the value
     * @param val  the value itself
     */
    HuffmanLeaf(FrequencyType freq, ContentType val) {
        super(freq);
        this.value = val;
    }

    /** @return the value this leaf stands for */
    ContentType getValue() {
        return this.value;
    }
}
package markov.huffman;
/**
 * Inner node of a Huffman tree. Its frequency is the sum of the frequencies of its two
 * children.
 */
class HuffmanNode<ContentType, SequenceType, FrequencyType extends FrequenceType<FrequencyType>>
        extends HuffmanTree<ContentType, SequenceType, FrequencyType> {

    private final HuffmanTree<ContentType, SequenceType, FrequencyType> left;
    private final HuffmanTree<ContentType, SequenceType, FrequencyType> right;

    /**
     * @param l left child
     * @param r right child
     */
    HuffmanNode(HuffmanTree<ContentType, SequenceType, FrequencyType> l,
                HuffmanTree<ContentType, SequenceType, FrequencyType> r) {
        super(l.getFrequency().add(r.getFrequency()));
        this.left = l;
        this.right = r;
    }

    /** @return the left child */
    HuffmanTree<ContentType, SequenceType, FrequencyType> getLeft() {
        return this.left;
    }

    /** @return the right child */
    HuffmanTree<ContentType, SequenceType, FrequencyType> getRight() {
        return this.right;
    }
}
package markov.huffman;
import markov.stuff.CountMap;
import java.io.Serializable;
import java.util.Collections;
import java.util.Map;
/**
 * Serializable holder for count maps keyed by an integer.
 *
 * @param <Subtype> type of the counted values
 */
public class HuffmanStore<Subtype extends Serializable> implements Serializable {
    private static final long serialVersionUID = 21475544004070999L;

    private final Map<Integer, CountMap<Subtype>> internal;

    /**
     * @param internal the map to wrap; it is stored as given, not copied
     */
    public HuffmanStore(Map<Integer, CountMap<Subtype>> internal) {
        this.internal = internal;
    }

    /**
     * @param i key to look up
     * @return the count map stored under {@code i}, or {@code null} if there is none
     */
    public CountMap<Subtype> get(int i) {
        return this.internal.get(i);
    }

    /** @return a read-only view of the wrapped map */
    public Map<Integer, CountMap<Subtype>> getInternal() {
        Map<Integer, CountMap<Subtype>> readOnly = Collections.unmodifiableMap(this.internal);
        return readOnly;
    }
}
package markov.huffman;
/**
 * Common base of Huffman tree nodes: carries a frequency and, once assigned, the code leading
 * to the node. Trees are ordered by frequency.
 */
abstract class HuffmanTree<ContentType, SequenceType, FrequencyType extends FrequenceType<FrequencyType>>
        implements Comparable<HuffmanTree<ContentType, SequenceType, FrequencyType>> {

    private final FrequencyType frequency;

    /** Code of this node; {@code null} until {@link #setCode} is called. */
    private SequenceType code = null;

    HuffmanTree(FrequencyType freq) {
        this.frequency = freq;
    }

    FrequencyType getFrequency() {
        return this.frequency;
    }

    public SequenceType getCode() {
        return this.code;
    }

    public void setCode(SequenceType code) {
        this.code = code;
    }

    /** Orders trees by their frequency only. */
    @Override
    public int compareTo(HuffmanTree<ContentType, SequenceType, FrequencyType> o) {
        FrequencyType mine = this.frequency;
        FrequencyType theirs = o.frequency;
        return mine.compareTo(theirs);
    }
}
package markov.stuff;
import com.tomgibara.bits.Bits;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Packs a list of bits into bytes and back.
 *
 * <p>Layout: bit {@code k} of the stream lives in byte {@code k / 8} at bit position
 * {@code k % 8} (least significant bit first). The first three bits of the stream are a
 * header holding {@code (payloadBits + 3) % 8}, i.e. how many bits of the last byte are in
 * use (0 meaning all eight); the payload follows directly after the header.
 */
public class BitConverter {

    /** Number of header bits in front of the payload. */
    private static final int HEADER_BITS = 3;

    /** Mask selecting the header from the first byte. */
    private static final int HEADER_MASK = 0b00000111;

    /**
     * Packs the given bits.
     *
     * @param list payload bits in order; must not contain {@code null}
     * @return the packed bytes, always at least one byte long
     */
    public byte[] toBytes(List<Boolean> list) {
        int totalBits = list.size() + HEADER_BITS;
        byte[] bytes = new byte[(totalBits + 7) / 8];
        bytes[0] = (byte) (totalBits % 8);
        int position = HEADER_BITS;
        for (Boolean bit : list) {
            if (bit) {
                bytes[position / 8] |= (byte) (1 << (position % 8));
            }
            position++;
        }
        return bytes;
    }

    /**
     * Unpacks bytes produced by {@link #toBytes(List)}.
     *
     * @param array packed bytes
     * @return the payload bits; an empty list for an empty array
     * @throws IllegalArgumentException if the header claims fewer bits than the header itself
     *                                  occupies
     */
    public List<Boolean> toBits(byte[] array) {
        int arrayLength = array.length;
        if (arrayLength == 0) {
            return Collections.emptyList();
        }
        int lastByteNotEmpty = array[0] & HEADER_MASK;
        int listSize = 8 * arrayLength + (lastByteNotEmpty == 0 ? -3 : lastByteNotEmpty - 11);
        // Only a header of 1 or 2 in a single byte is impossible; a header of 3 in a single
        // byte is the valid encoding of an empty payload.
        if (listSize < 0) {
            throw new IllegalArgumentException("corrupted data");
        }
        return toListPrimitiv(listSize, array);
    }

    /** Reads {@code listSize} payload bits that start right after the header. */
    private static List<Boolean> toListPrimitiv(int listSize, byte[] array) {
        List<Boolean> list = new ArrayList<>(listSize);
        for (int i = HEADER_BITS; i < listSize + HEADER_BITS; i++) {
            list.add(((array[i / 8] >> (i % 8)) & 1) == 1);
        }
        return list;
    }
}
package markov.stuff;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;
/**
 * Counts occurrences of values.
 *
 * @param <T> type of the counted values
 */
public interface CountMap<T> {

    /**
     * Registers one occurrence of the given value.
     *
     * @param t the value that occurred
     */
    void count(T t);

    /** @return the counts as a map from value to counter */
    HashMap<T, SimpleCountMap.MutableInt> asMap();

    /** @return the counts as a stream of value/counter entries */
    Stream<Map.Entry<T, SimpleCountMap.MutableInt>> result();
}
package markov.stuff;
import markov.Data;
import markov.Lookup;
import markov.Prefix;
import markov.Token;
import java.lang.reflect.Field;
import java.util.HashMap;
import java.util.Map;
/** Debug helper that prints hash and bucket clash statistics of the maps inside {@link Data}. */
public class Inspector {

    /**
     * Prints, for every entry, the hash clash and bucket clash distribution of the map held by
     * the entry's {@link Data}.
     *
     * @param dataMaps models keyed by prefix length
     * @throws NoSuchFieldException   if an expected field cannot be found via reflection
     * @throws IllegalAccessException if a reflected field cannot be read
     */
    public static void printDataHashStats(Map<Integer, Data> dataMaps) throws NoSuchFieldException, IllegalAccessException {
        for (Map.Entry<Integer, Data> entry : dataMaps.entrySet()) {
            Data model = entry.getValue();
            System.out.println(String.format("map for prefixLength: %d:", entry.getKey()));

            System.out.println("Hash clash print: ");
            printClashes(MapClashInspector.getHashClashDistribution(extractMap(model)));

            System.out.println("Bucket entry clash print: ");
            printClashes(MapClashInspector.getBucketClashDistribution(extractMap(model)));
        }
    }

    /**
     * Reads the field named {@code data} of the given object's class via reflection and returns
     * it as a hash map.
     *
     * @param data the model to inspect
     * @return the value of its {@code data} field
     * @throws NoSuchFieldException   if the class declares no field named {@code data}
     * @throws IllegalAccessException if the field cannot be read
     */
    @SuppressWarnings("unchecked")
    public static HashMap<Prefix, Lookup> extractMap(Data data) throws NoSuchFieldException, IllegalAccessException {
        Field dataField = data.getClass().getDeclaredField("data");
        dataField.setAccessible(true);
        Object raw = dataField.get(data);
        return (HashMap<Prefix, Lookup>) raw;
    }

    /** Prints one {@code key: value} line per entry. */
    private static void printClashes(Map<Integer, Integer> clashes) {
        clashes.forEach((clashCount, occurrences) ->
                System.out.println(clashCount + ": " + occurrences));
    }
}
package markov.stuff;
import java.lang.reflect.*;
import java.util.*;
/**
* taken from <a href="https://www.javaspecialists.eu/archive/Issue235.html">jave newsletter</a>
*/
public class MapClashInspector {
private interface MapProcessor {
void beginBucket();
void process(Map.Entry<?, ?> node);
void endBucket(Map<Integer, Integer> count);
}
/**
* Returns a map showing as key the number of clashes and
* as value the number of entries with identical hash codes.
* With a "perfect" hash function, the map will contain only
* one entry with 1 as a key and the number of entries in the
* map as a value.
*/
public static Map<Integer, Integer> getHashClashDistribution(
Map<?, ?> map)
throws NoSuchFieldException, IllegalAccessException {
return getBucketDistribution(map, new MapProcessor() {
private final Map<Integer, Integer> numberOfClashes =
new HashMap<Integer, Integer>();
public void beginBucket() {
numberOfClashes.clear();
}
public void process(Map.Entry<?, ?> node) {
increment(numberOfClashes, node.getKey().hashCode());
}
public void endBucket(Map<Integer, Integer> count) {
for (Integer val : numberOfClashes.values()) {
increment(count, val);
}
}
});
}
/**
* Returns a map showing as key the number of clashes and
* as value the number of buckets with this number of clashes.
* In a "perfect" distribution, we would have
* 1->numberOfEntriesInMap. The worst possible distribution
* is numberOfEntriesInMap->1, where all the entries go into a
* single bucket. It also shows the number of empty buckets.
* The Java 8 HashMap copes well with clashes, but earlier
* versions would become very slow due to O(n) lookup.
*/
public static Map<Integer, Integer> getBucketClashDistribution(
Map<?, ?> map)
throws NoSuchFieldException, IllegalAccessException {
return getBucketDistribution(map, new MapProcessor() {
private int size;
public void beginBucket() {
size = 0;
}
public void process(Map.Entry<?, ?> node) {
size++;
}
public void endBucket(Map<Integer, Integer> count) {
increment(count, size);
}
});
}
/**
* Increment the value if already exists; otherwise set to 1.
*/
private static void increment(
Map<Integer, Integer> map, int size) {
Integer counter = map.get(size);
if (counter == null) {
map.put(size, 1);
} else {
map.put(size, counter + 1);
}
}
/**
 * Walks every bucket of the map's internal table and lets the
 * processor tally it.
 * <p>
 * For each slot of the table the processor sees {@code beginBucket()},
 * one {@code process(node)} call per chained entry, and finally
 * {@code endBucket(result)}, where {@code result} is the sorted map
 * returned by this method.
 *
 * @param map the map whose {@code table} field is read reflectively
 * @param processor callback that accumulates the per-bucket statistics
 * @return the distribution filled in by the processor; empty when the
 *         map has not allocated a table yet
 * @throws NoSuchFieldException if the {@code table} or {@code next}
 *         field cannot be found
 * @throws IllegalAccessException if a reflective field read is denied
 */
private static Map<Integer, Integer> getBucketDistribution(
        Map<?, ?> map, MapProcessor processor)
        // Since Java 1.7, we can throw ReflectiveOperationException
        throws NoSuchFieldException, IllegalAccessException {
    Map<Integer, Integer> numberPerBucket =
            new TreeMap<Integer, Integer>();
    Map.Entry<?, ?>[] table = getTable(map);
    if (table == null) {
        // OpenJDK's HashMap allocates its bucket array lazily, so a map
        // that never held an entry has no table: nothing to count.
        return numberPerBucket;
    }
    Field nextNodeField = getNextField(table);
    for (Map.Entry<?, ?> bucketHead : table) {
        processor.beginBucket();
        // Follow the chain of entries hanging off this slot.
        for (Map.Entry<?, ?> node = bucketHead; node != null;
                node = (Map.Entry<?, ?>) nextNodeField.get(node)) {
            processor.process(node);
        }
        processor.endBucket(numberPerBucket);
    }
    return numberPerBucket;
}
/**
 * Reads the internal bucket array (the field named {@code table}) of
 * the given map via reflection.
 * <p>
 * The field is searched in the map's own class first and then in its
 * superclasses, so subclasses of a table-based map (for example a
 * class extending {@code HashMap}) are supported as well.
 *
 * @param map the map to inspect
 * @return the bucket array; may be {@code null} if the map has not
 *         allocated it yet
 * @throws NoSuchFieldException if neither the map's class nor any of
 *         its superclasses declares a {@code table} field
 * @throws IllegalAccessException if the field cannot be read
 */
private static Map.Entry<?, ?>[] getTable(Map<?, ?> map)
        throws NoSuchFieldException, IllegalAccessException {
    Field tableField = null;
    for (Class<?> type = map.getClass();
            type != null && tableField == null;
            type = type.getSuperclass()) {
        try {
            tableField = type.getDeclaredField("table");
        } catch (NoSuchFieldException ignored) {
            // Not declared at this level; keep climbing the hierarchy.
        }
    }
    if (tableField == null) {
        throw new NoSuchFieldException("table");
    }
    tableField.setAccessible(true);
    return (Map.Entry<?, ?>[]) tableField.get(map);
}
/**
 * Looks up the field named {@code next} on the element type of the
 * given array and makes it accessible.
 *
 * @param table an array; its component type must declare {@code next}
 * @return the accessible {@code next} field of the component type
 * @throws NoSuchFieldException if the component type declares no
 *         {@code next} field
 */
private static Field getNextField(Object table)
        throws NoSuchFieldException {
    Field link = table.getClass()
            .getComponentType()
            .getDeclaredField("next");
    link.setAccessible(true);
    return link;
}
}
\ No newline at end of file
package markov.stuff;
import markov.huffman.FrequenceType;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;
/**
 * A {@link HashMap}-backed {@code CountMap}: every key maps to a mutable
 * counter that is bumped in place each time the key is counted.
 *
 * @param <T> type of the counted items
 */
public class SimpleCountMap<T> extends HashMap<T, SimpleCountMap.MutableInt> implements CountMap<T> {

    /**
     * Mutable {@code int} box used as the map value, so counting an
     * already known key needs no second map write.
     */
    public static class MutableInt implements FrequenceType<MutableInt>, Serializable {

        // Always assigned by the only constructor.
        private int value;

        /**
         * @param value the initial counter value
         */
        public MutableInt(int value) {
            this.value = value;
        }

        /** Adds one to the counter. */
        public void increment() {
            value += 1;
        }

        /**
         * @return the current counter value
         */
        public int get() {
            return value;
        }

        /**
         * @return a new counter holding the sum of both values; neither
         *         operand is modified
         */
        @Override
        public MutableInt add(MutableInt b) {
            int sum = get() + b.get();
            return new MutableInt(sum);
        }

        /**
         * @return {@code true} if the counter value is strictly positive
         */
        @Override
        public boolean isGreaterZero() {
            return 0 < get();
        }

        /** Orders counters by their numeric value, ascending. */
        @Override
        public int compareTo(MutableInt o) {
            return Integer.compare(get(), o.get());
        }
    }

    /** Creates an empty map with the default capacity and load factor. */
    public SimpleCountMap() {
    }

    /**
     * @param initialCapacity initial capacity handed to {@link HashMap}
     */
    public SimpleCountMap(int initialCapacity) {
        super(initialCapacity);
    }

    /**
     * @param initialCapacity initial capacity handed to {@link HashMap}
     * @param loadFactor load factor handed to {@link HashMap}
     */
    public SimpleCountMap(int initialCapacity, float loadFactor) {
        super(initialCapacity, loadFactor);
    }

    /**
     * Creates a map with the same mappings as {@code m}; the counter
     * objects themselves are shared with {@code m}, not copied.
     *
     * @param m the mappings to start with
     */
    public SimpleCountMap(Map<? extends T, ? extends MutableInt> m) {
        super(m);
    }

    /**
     * Counts one occurrence of {@code t}: an unknown key is inserted
     * with a count of 1, a known key has its counter incremented.
     */
    @Override
    public void count(T t) {
        MutableInt existing = get(t);
        if (existing != null) {
            existing.increment();
            return;
        }
        put(t, new MutableInt(1));
    }

    /**
     * @return the entries of this map, sorted by count with the highest
     *         count first
     */
    @Override
    public Stream<Map.Entry<T, MutableInt>> result() {
        return entrySet()
                .stream()
                .sorted((left, right) -> Integer.compare(right.getValue().get(), left.getValue().get()));
    }

    /**
     * @return this very instance, viewed as a plain {@link HashMap}
     */
    @Override
    public HashMap<T, MutableInt> asMap() {
        return this;
    }
}
\ No newline at end of file
package markov.stuff;
import markov.Data;
import markov.Parser;
import markov.Token;
import markov.Tokenizer;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * Static helpers for the markov tools: byte/bit conversions, file
 * lookup, progress output and construction of {@link Data} maps from
 * text or from a directory of e-mails.
 */
public class Utils {

    /** Prefix marking a location that is resolved through the class loader. */
    private static final String CLASSPATH_PREFIX = "classpath:";

    /**
     * Encodes an {@code int} as 4 bytes in {@link ByteBuffer}'s default
     * (big-endian) byte order.
     *
     * @param value the value to encode
     * @return a new 4-byte array
     */
    public static byte[] toByteArray(int value) {
        return ByteBuffer.allocate(4).putInt(value).array();
    }

    /**
     * Decodes the first 4 bytes of the array as an {@code int}; the
     * inverse of {@link #toByteArray(int)}.
     *
     * @param bytes at least 4 bytes
     * @return the decoded value
     * @throws java.nio.BufferUnderflowException if fewer than 4 bytes are given
     */
    public static int fromByteArray(byte[] bytes) {
        return ByteBuffer.wrap(bytes).getInt();
    }

    /**
     * Copies a list of boxed booleans into a primitive array of the
     * same length and order.
     *
     * @param list the bits to copy; must not contain {@code null}
     * @return a new array holding the list's values
     */
    public static boolean[] fromBitList(List<Boolean> list) {
        boolean[] ret = new boolean[list.size()];
        int i = 0;
        for (boolean b : list) {
            ret[i++] = b;
        }
        return ret;
    }

    /**
     * Tokenizes and parses the input for a single prefix length.
     *
     * @param input the text to parse
     * @param prefixLength the only prefix length handed to the parser
     * @return the first {@link Data} value of the parser's result map
     * @throws java.util.NoSuchElementException if the parser returned an empty map
     */
    public static Data parse(Stream<String> input, int prefixLength) {
        // The parser API takes a collection of prefix lengths; wrap the single one.
        Collection<Integer> collectionDummy = new ArrayList<>();
        collectionDummy.add(prefixLength);
        return new Parser().parse(new Tokenizer().tokenize(input), collectionDummy).values()
                .stream().findFirst().get();
    }

    /**
     * Resolves a location to an existing, readable and writable file.
     * A location starting with {@code classpath:} is looked up through
     * this class's class loader; anything else is used as a file path.
     *
     * @param location file path, or {@code classpath:}-prefixed resource name
     * @return the resolved file
     * @throws IllegalArgumentException if the classpath resource does not
     *         exist, its URL cannot be turned into a file, or the file is
     *         missing, unreadable or not writable
     */
    public static File getFile(String location) {
        File file;
        if (location.startsWith(CLASSPATH_PREFIX)) {
            // Strip only the leading marker, not later occurrences in the name.
            String resource = location.substring(CLASSPATH_PREFIX.length());
            URL url = Utils.class.getClassLoader().getResource(resource);
            if (url == null) {
                // getResource signals "not found" with null; fail with a clear message.
                throw new IllegalArgumentException(
                        String.format("classpath resource not found: %s", resource));
            }
            try {
                file = new File(url.toURI());
            } catch (URISyntaxException e) {
                throw new IllegalArgumentException(e);
            }
        } else {
            file = new File(location);
        }
        if (!file.exists() || !file.canRead() || !file.canWrite()) {
            throw new IllegalArgumentException(String.format("could not use file: %s", file.getName()));
        }
        return file;
    }

    /**
     * Same as {@link #maybePrintPercentages(String, int, int)} with an
     * empty prefix.
     */
    public static void maybePrintPercentages(final int i, final int max) {
        maybePrintPercentages("", i, max);
    }

    /**
     * Prints progress to standard output, throttled according to the
     * size of {@code max}:
     * <ul>
     * <li>{@code max > 100_000}: a percentage each time {@code i} is a
     *     multiple of {@code max / 100};</li>
     * <li>{@code max < 10}: {@code i/max} on every call;</li>
     * <li>{@code max < 100}: a percentage on every call;</li>
     * <li>otherwise: a percentage each time {@code i} is a multiple of
     *     {@code max / 10}.</li>
     * </ul>
     *
     * @param prefix text printed in front of the progress value
     * @param i current step
     * @param max total number of steps
     */
    public static void maybePrintPercentages(String prefix, final int i, final int max) {
        if (max > 100_000) {
            if (i % (max / 100) == 0) {
                System.out.println(prefix + i / (max / 100) + "%");
                return;
            }
        }
        if (max < 10) {
            System.out.println(prefix + " " + i + "/" + max);
            return;
        }
        if (max < 100) {
            System.out.println(prefix + (int) ((double) i / max * 100) + "%");
            return;
        }
        if (i % (max / 10) == 0) {
            System.out.println(prefix + i / (max / 100) + "%");
        }
    }

    /**
     * Reads a serialized data map from a fixed, machine-specific path.
     * The stream is closed before returning, also on failure.
     *
     * @return the deserialized map
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if a serialized class is not on the classpath
     */
    public static Map<Integer, Data> deserializeDataMap() throws IOException,
            ClassNotFoundException {
        System.out.println("reading map");
        File source = new File(
                "C:\\Users\\admin\\git\\markovdisplay\\target\\classes\\data");
        try (InputStream raw = Files.newInputStream(source.toPath());
                ObjectInputStream in = new ObjectInputStream(raw)) {
            // The file is expected to hold a serialized Map<Integer, Data>.
            @SuppressWarnings("unchecked")
            Map<Integer, Data> dataMap = (Map<Integer, Data>) in.readObject();
            System.out.println("read map");
            return dataMap;
        }
    }

    /**
     * Builds the data map from a fixed, machine-specific e-mail directory.
     *
     * @param maxPrefix largest prefix length to generate (inclusive)
     * @return one {@link Data} per prefix length, as produced by the parser
     */
    public static Map<Integer, Data> createDataMap(int maxPrefix) {
        return createDataMap(maxPrefix, new File("C:\\Users\\admin\\Desktop\\emails"));
    }

    /**
     * Builds the data map from the {@code text/plain} e-mails among the
     * first 500 files of a directory, for prefix lengths 1 to
     * {@code maxPrefix}.
     *
     * @param maxPrefix largest prefix length to generate (inclusive)
     * @param parent directory containing one MIME message per file
     * @return one {@link Data} per prefix length, as produced by the parser
     * @throws IllegalArgumentException if {@code parent} cannot be listed
     * @throws RuntimeException wrapping any I/O or mail parsing failure
     */
    public static Map<Integer, Data> createDataMap(int maxPrefix, File parent) {
        System.out.println("generating data map...");
        File[] files = parent.listFiles();
        if (files == null) {
            // listFiles() returns null for non-directories and on I/O errors.
            throw new IllegalArgumentException(
                    String.format("not a readable directory: %s", parent));
        }
        Stream<String> mails = Arrays.stream(files)
                .limit(500)
                .map(Utils::readMessage)
                .filter(Utils::isPlainText)
                .map(Utils::contentOf);
        Parser parser = new Parser();
        Tokenizer tokenizer = new Tokenizer();
        Stream<Stream<Token>> tokens = tokenizer.tokenize(mails);
        Map<Integer, Data> ret = parser.parse(tokens, IntStream.range(1, maxPrefix + 1)
                .boxed()
                .collect(Collectors.toList()));
        System.out.println("generated data map!");
        return ret;
    }

    /** Parses one file as a MIME message and closes the file afterwards. */
    private static MimeMessage readMessage(File file) {
        // The MimeMessage constructor reads and parses the stream itself,
        // so the stream can be released as soon as it returns.
        try (InputStream in = Files.newInputStream(file.toPath())) {
            return new MimeMessage(Session.getDefaultInstance(new Properties()), in);
        } catch (IOException | MessagingException e) {
            throw new RuntimeException(e);
        }
    }

    /** Tells whether the message's content type mentions {@code text/plain}. */
    private static boolean isPlainText(MimeMessage message) {
        try {
            return message.getContentType().contains("text/plain");
        } catch (MessagingException e) {
            throw new RuntimeException(e);
        }
    }

    /** Returns the message content converted with {@code toString()}. */
    private static String contentOf(MimeMessage message) {
        try {
            return message.getContent().toString();
        } catch (IOException | MessagingException e) {
            throw new RuntimeException(e);
        }
    }
}
package markov;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Placeholder test class.
 * NOTE(review): judging by the name it is presumably meant to cover a
 * byte-Huffman based Shortener implementation - confirm and add cases.
 */
class ShortenerByteHuffmanImplTest {
/**
 * Empty test: the body has no statements and no assertions.
 */
@Test
void random() {
}
}
\ No newline at end of file
package markov;
import org.junit.jupiter.api.Test;
import java.util.Base64;
import java.util.stream.Stream;
/**
 * Smoke test for {@code Shortener}: prints the produced hash for manual
 * inspection; nothing is asserted.
 */
class ShortenerTest {

    /**
     * Shortens the ints 561, 0, 64, 0 and prints, in this order: a Base64
     * reference string, the hash, and the hash length.
     */
    @Test
    void shortenerTest() {
        String hash = new Shortener(null).fromInts(Stream.of(561, 0, 64, 0));

        // Reference output: Base64 of the five raw bytes 0, 0, '@', 0, 0.
        byte[] referenceBytes = "\0\0@\0\0".getBytes();
        String referenceBase64 = Base64.getEncoder().encodeToString(referenceBytes);

        System.out.println(referenceBase64);
        System.out.println(hash);
        System.out.println(hash.length());
    }
}
\ No newline at end of file
package markov;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
/**
 * Equality tests for {@code Token} and {@code Prefix}.
 */
public class TokenTests {

    /**
     * Tokens built from the same text are equal; tokens built from
     * different text are not.
     */
    @Test
    public void testTokenEquals() {
        Token a = new Token("a");
        Token aa = new Token("a");
        Token b = new Token("b");
        assertEquals(a, aa);
        assertNotEquals(a, b);
        assertNotEquals(aa, b);
    }

    /**
     * Prefixes holding equal tokens are equal, whether they were built
     * from an array or from a list; prefixes holding different tokens
     * are not.
     */
    @Test
    public void testPrefixEquals() {
        Prefix a = new Prefix(new Token[]{new Token("a")});
        Prefix aa = new Prefix(new Token[]{new Token("a")});
        Prefix aaa = new Prefix(Arrays.asList(new Token("a")));
        Prefix b = new Prefix(new Token[]{new Token("b")});
        assertEquals(a, aa);
        assertEquals(aa, aaa);
        assertEquals(a, aaa);
        assertNotEquals(a, b);
        assertNotEquals(aa, b);
        assertNotEquals(aaa, b);
    }
}
package markov.huffman;
import com.tomgibara.bits.Bits;
import markov.stuff.SimpleCountMap;
import markov.stuff.Utils;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.List;
/**
 * Exploratory test for {@code ByteHuffmanCodeBuilder}: builds a code,
 * encodes a few symbols and prints every intermediate representation.
 * Nothing is asserted; the output is meant for manual inspection.
 */
class ByteHuffmanCodeBuilderTest {

    /**
     * Builds a code in which symbol {@code i} (1..7) was counted
     * {@code i} times, encodes the symbols 6, 2, 1, 7, 4 into one bit
     * list, round-trips the bits through a byte array and Base64, and
     * prints the decoding of both the round-tripped and the original bits.
     */
    @Test
    void testByteHuffman() {
        SimpleCountMap<Integer> frequencies = new SimpleCountMap<>();
        for (int symbol = 0; symbol < 8; symbol++) {
            // Symbol n is counted n times, so symbol 0 never enters the map.
            for (int occurrence = 0; occurrence < symbol; occurrence++) {
                frequencies.count(symbol);
            }
        }
        ByteHuffmanCodeBuilder<Integer, SimpleCountMap.MutableInt> builder = new ByteHuffmanCodeBuilder<>();
        HuffmanCode<Integer, List<Boolean>, Boolean, SimpleCountMap.MutableInt> code =
                builder.generateCode(frequencies);

        List<Boolean> encoded = new ArrayList<>();
        for (int symbol : new int[] {6, 2, 1, 7, 4}) {
            encoded.addAll(encodeAndPrint(code, symbol));
        }
        System.out.println(encoded);

        byte[] packed = Bits.asStore(Utils.fromBitList(encoded)).toByteArray();
        System.out.println(Arrays.toString(packed));
        String base64 = Base64.getEncoder().encodeToString(packed);
        System.out.println(base64);

        byte[] unpacked = Base64.getDecoder().decode(base64);
        List<Boolean> roundTripped = Bits.asStore(unpacked).asList();
        System.out.println(code.decode(roundTripped));
        System.out.println(code.decode(encoded));
    }

    /**
     * Encodes one symbol, prints its bits and their byte-array form,
     * and returns the bits.
     */
    private List<Boolean> encodeAndPrint(
            HuffmanCode<Integer, List<Boolean>, Boolean, SimpleCountMap.MutableInt> code, int n) {
        List<Boolean> bits = code.encode(n);
        System.out.println(bits);
        System.out.println(bitsAsByteArrayString(bits));
        return bits;
    }

    /** Packs the bits into a byte array and renders it with {@link Arrays#toString(byte[])}. */
    private String bitsAsByteArrayString(List<Boolean> bits) {
        byte[] packed = Bits.asStore(Utils.fromBitList(bits)).toByteArray();
        return Arrays.toString(packed);
    }
}
\ No newline at end of file
package markov.huffman;
import com.tomgibara.bits.BitVector;
import markov.stuff.CountMap;
import org.junit.jupiter.api.Test;
import java.util.BitSet;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
class HuffmanCodeBuilderTest {
// @Test
// void testSimple() {
// CountMap<String> counts = new CountMap<>();
// counts.count("a");
// counts.count("a");
// counts.count("b");
// counts.count("c");
// counts.count("c");
// counts.count("c");
// counts.count("d");
// HuffmanCodeBuilder<String, String, String, CountMap.MutableInt> decoder = new <CountMap.MutableInt>HuffmanCodeBuilder<String, String, String, CountMap.MutableInt>(counts,
// () -> new String(),
// p -> p.getCode() + "0",
// p -> p.getCode() + "1",
// mutableInt -> mutableInt.get());
// Map<String, CountMap.MutableInt> frequencies = counts;
// HuffmanCode<String, String> codes = decoder.generateCode();
// assertEquals("10", codes.encode("a"));
// assertEquals("110", codes.encode("b"));
// assertEquals("0", codes.encode("c"));
// assertEquals("111", codes.encode("d"));
//
// assertEquals("a", codes.decode("10"));
// assertEquals("b", codes.decode("110"));
// assertEquals("c", codes.decode("0"));
// assertEquals("d", codes.decode("111"));
// }
//
//
@Test
public void testBitVectorFromTomgibara() {
BitVector _ = new BitVector(0);
assertEquals(0, _.size());
assertEquals("", _.toString());
BitVector _0 = _.resizedCopy(1, true);
_0.setBit(0, false);
assertEquals(0, _.size());
assertEquals("", _.toString());
assertEquals(1, _0.size());
assertEquals("0", _0.toString());
BitVector _1 = _.resizedCopy(1, true);
_1.setBit(0, true);
assertEquals(0, _.size());
assertEquals("", _.toString());
assertEquals(1, _0.size());
assertEquals("0", _0.toString());
assertEquals(1, _1.size());
assertEquals("1", _1.toString());
}
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment