Commit 218bdd37 by Hut

some cleanup

parent 0d976c4d
......@@ -17,7 +17,11 @@
</properties>
<dependencies>
<dependency>
<groupId>directory.passive</groupId>
<artifactId>huffman</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>com.tomgibara.bits</groupId>
<artifactId>bits</artifactId>
......@@ -35,6 +39,12 @@
<version>${junit.jupiter.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<version>${junit.jupiter.version}</version>
<scope>test</scope>
</dependency>
<!-- To avoid compiler warnings about @API annotations in JUnit code -->
<dependency>
<groupId>org.apiguardian</groupId>
......
......@@ -23,7 +23,6 @@ public class Data implements Serializable{
/**
 * Records token {@code t} in the {@code Lookup} stored for prefix {@code p}.
 * A new {@code Lookup} is created and stored the first time a prefix is seen.
 *
 * @param p prefix under which the token is recorded
 * @param t token to add to that prefix's lookup
 */
public void add(Prefix p, Token t) {
    // computeIfAbsent only constructs a Lookup for prefixes that are not
    // mapped yet, instead of allocating a throw-away default on every call
    // and re-putting a value that is already in the map.
    data.computeIfAbsent(p, key -> new Lookup()).add(t);
}
......
......@@ -4,6 +4,7 @@ import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
......@@ -57,8 +58,9 @@ public class Lookup implements Serializable {
throw new IllegalStateException("not jet finished");
}
if (id < 0 || id > finishedSums.length) {
throw new IllegalArgumentException(
String.format("got %d but expected id [0; %d)", id, finishedSums.length));
throw new IllegalArgumentException(String.format("got %d but expected id [0; %d)",
id,
finishedSums.length));
}
return finalData[id];
}
......@@ -79,20 +81,18 @@ public class Lookup implements Serializable {
int size = tokens.size();
finishedSums = new long[size];
finalData = new Decission[size];
final int[] i = new int[1];
i[0] = 0;
final long[] sum = new long[1];
sum[0] = 0;
tokens.entrySet()
.stream()
.sequential()
.sorted(Comparator.comparingInt(Entry::getValue))
.forEach(e -> {
sum[0] += e.getValue();
finishedSums[i[0]] = sum[0];
finalData[i[0]] = new Decission(e.getKey(), i[0], this);
i[0]++;
});
List<Entry<Token, Integer>> orderedEntries = tokens.entrySet()
.stream()
.sequential()
.sorted(Comparator.comparingInt(Entry::getValue))
.collect(Collectors.toList());
int sum = 0;
for (int i = 0; i < orderedEntries.size(); i++) {
Entry<Token, Integer> entry = orderedEntries.get(i);
sum += entry.getValue();
finishedSums[i] = sum;
finalData[i] = new Decission(entry.getKey(), i, this);
}
this.tokens.clear();
isFinishedCollecting = true;
}
......@@ -100,10 +100,10 @@ public class Lookup implements Serializable {
@Override
public String toString() {
return "Lookup [tokens= " + tokens.entrySet().stream().sorted(
(e1, e2) -> Integer.compare(e2.getValue(), e1.getValue())).map(
e -> String.format("%d*%s", e.getValue(), e.getKey())).collect(
Collectors.joining(", ")) + "]";
return "Lookup [tokens= " + tokens.entrySet().stream().sorted((e1, e2) -> Integer.compare(e2
.getValue(), e1.getValue())).map(e -> String.format("%d*%s",
e.getValue(),
e.getKey())).collect(Collectors.joining(", ")) + "]";
}
/**
......
package markov;
import markov.huffman.HuffmanCode;
import directory.passive.huffman.HuffmanCode;
import markov.stuff.BitConverter;
import markov.stuff.SimpleCountMap;
import markov.stuff.Utils;
import java.util.Base64;
......@@ -14,12 +13,11 @@ import java.util.stream.Stream;
public class ShortenerByteHuffmanImpl extends ShortenerSimpleImpl {
private final HuffmanCode<Byte, List<Boolean>, Boolean, SimpleCountMap.MutableInt> byteCode;
private final HuffmanCode<Byte, List<Boolean>> byteCode;
public ShortenerByteHuffmanImpl(
Data data,
HuffmanCode<Byte, List<Boolean>, Boolean, SimpleCountMap.MutableInt> byteCode) {
Data data, HuffmanCode<Byte, List<Boolean>> byteCode) {
super(data);
this.byteCode = byteCode;
}
......
package markov;
import markov.huffman.HuffmanCode;
import directory.passive.huffman.HuffmanCode;
import markov.stuff.BitConverter;
import markov.stuff.SimpleCountMap;
import java.util.Base64;
import java.util.List;
......@@ -12,12 +11,11 @@ import java.util.stream.Stream;
public class ShortenerIntHuffmanImpl extends ShortenerSimpleImpl {
private final HuffmanCode<Integer, List<Boolean>, Boolean, SimpleCountMap.MutableInt> intCode;
private final HuffmanCode<Integer, List<Boolean>> intCode;
public ShortenerIntHuffmanImpl(
Data data,
HuffmanCode<Integer, List<Boolean>, Boolean, SimpleCountMap.MutableInt> byteCode) {
Data data, HuffmanCode<Integer, List<Boolean>> byteCode) {
super(data);
this.intCode = byteCode;
}
......
......@@ -10,15 +10,45 @@ public class Tokenizer {
public Stream<Stream<Token>> tokenizeCombined(Stream<Stream<String>> input) {
return input.map(stringStream -> putMarkers(combineToTokens(stringStream.flatMap(s -> s.codePoints().boxed().map(
return input.map(stringStream -> putMarkers(combineToTokens(stringStream.flatMap(s -> s
.codePoints()
.boxed()
.map(this::glyphFromCodePoint)))));
}
/**
 * Frames a token stream with the sentence markers: {@code Token.START} is
 * emitted first, then every token of {@code input}, then {@code Token.END}.
 */
private Stream<Token> putMarkers(Stream<Token> input) {
    Stream<Token> inputThenEnd = Stream.concat(input, Stream.of(Token.END));
    return Stream.concat(Stream.of(Token.START), inputThenEnd);
}
this::glyphFromCodePoint)))));
/**
 * Groups runs of consecutive glyphs that share the same type and turns each
 * run into one token via {@code getTokenFromGlyphs}.
 *
 * A glyph of type {@code Glyph.Type.empty} is appended to the input so that
 * the final run is followed by an element and can be flushed.
 *
 * NOTE(review): the first lambda mutates {@code previous}, so this appears to
 * rely on the stream being processed sequentially and in order - confirm that
 * callers never pass a parallel stream.
 * NOTE(review): a run is only emitted when the NEXT glyph has a different
 * type; if the last input glyph itself has type {@code empty}, the appended
 * sentinel does not differ from it and that run is not emitted - verify
 * whether such glyphs can occur.
 *
 * @param glyphs glyphs in reading order
 * @return one token per run of equally typed glyphs
 */
public Stream<Token> combineToTokens(Stream<Glyph> glyphs) {
// one-element array: a mutable holder the lambda below can write to
final Container[] previous = new Container[]{null};
return Stream.concat(glyphs, Stream.of(new Glyph(Glyph.Type.empty, ""))).map(g -> {
// wrap every glyph and link it to the container created just before it
Container o = new Container(g);
o.p = previous[0];
previous[0] = o;
return o;
}).flatMap(go -> {
// a type change between the predecessor and this glyph ends a run
if (go.p != null && go.p.self != null &&
!go.self.getType().equals(go.p.self.getType())) {
List<Glyph> tokenGlyphs = new ArrayList<>();
Container c = go;
// walk backwards over the finished run; inserting at index 0 keeps the
// collected glyphs in their original order
do {
c = c.p;
tokenGlyphs.add(0, c.self);
// c was already dereferenced above, so the c != null test cannot fail here
} while (c != null && c.p != null && c.p.self != null &&
c.self.getType().equals(c.p.self.getType()));
// drop the back link so containers of the emitted run are no longer
// reachable from this one
go.p = null; // memory optimization
return Stream.<List<Glyph>>builder().add(tokenGlyphs).build();
} else {
// still inside a run (or very first glyph): nothing to emit yet
return Stream.empty();
}
}).map(this::getTokenFromGlyphs);
}
public Stream<Stream<Token>> tokenize(Stream<String> input) {
return input.map(s -> putMarkers(combineToTokens(s.codePoints().boxed().map(
this::glyphFromCodePoint))));
return input.map(s -> putMarkers(combineToTokens(s.codePoints()
.boxed()
.map(this::glyphFromCodePoint))));
}
private Glyph glyphFromCodePoint(int codePoint) {
......@@ -27,7 +57,11 @@ public class Tokenizer {
type = Glyph.Type.whitespace;
} else if (Character.isAlphabetic(codePoint)) {
type = Glyph.Type.word;
} else if (Arrays.asList(Character.START_PUNCTUATION, Character.END_PUNCTUATION, Character.INITIAL_QUOTE_PUNCTUATION, Character.FINAL_QUOTE_PUNCTUATION,
} else if (Arrays.asList(
Character.START_PUNCTUATION,
Character.END_PUNCTUATION,
Character.INITIAL_QUOTE_PUNCTUATION,
Character.FINAL_QUOTE_PUNCTUATION,
Character.OTHER_PUNCTUATION).contains(Character.getType(codePoint))) {
type = Glyph.Type.punctuation;
}
......@@ -35,36 +69,6 @@ public class Tokenizer {
return new Glyph(type, value);
}
/**
 * Frames a token stream with the sentence markers: {@code Token.START} is
 * emitted first, then every token of {@code input}, then {@code Token.END}.
 */
private Stream<Token> putMarkers(Stream<Token> input) {
    Stream<Token> inputThenEnd = Stream.concat(input, Stream.of(Token.END));
    return Stream.concat(Stream.of(Token.START), inputThenEnd);
}
/**
 * Groups runs of consecutive glyphs that share the same type and turns each
 * run into one token via {@code getTokenFromGlyphs}.
 *
 * A glyph of type {@code Glyph.Type.empty} is appended to the input so that
 * the final run is followed by an element and can be flushed.
 *
 * NOTE(review): the first lambda mutates {@code previous}, so this appears to
 * rely on the stream being processed sequentially and in order - confirm that
 * callers never pass a parallel stream.
 * NOTE(review): a run is only emitted when the NEXT glyph has a different
 * type; if the last input glyph itself has type {@code empty}, the appended
 * sentinel does not differ from it and that run is not emitted - verify
 * whether such glyphs can occur.
 *
 * @param glyphs glyphs in reading order
 * @return one token per run of equally typed glyphs
 */
public Stream<Token> combineToTokens(Stream<Glyph> glyphs) {
// one-element array: a mutable holder the lambda below can write to
final Container[] previous = new Container[]{null};
return Stream.concat(glyphs, Stream.of(new Glyph(Glyph.Type.empty, "")))
.map(g -> {
// wrap every glyph and link it to the container created just before it
Container o = new Container(g);
o.p = previous[0];
previous[0] = o;
return o;
}).flatMap(go -> {
// a type change between the predecessor and this glyph ends a run
if (go.p != null && go.p.self != null && !go.self.getType().equals(go.p.self.getType())) {
List<Glyph> tokenGlyphs = new ArrayList<>();
Container c = go;
// walk backwards over the finished run; inserting at index 0 keeps the
// collected glyphs in their original order
do {
c = c.p;
tokenGlyphs.add(0, c.self);
}
// c was already dereferenced above, so the c != null test cannot fail here
while (c != null && c.p != null && c.p.self != null &&
c.self.getType().equals(c.p.self.getType()));
// drop the back link so containers of the emitted run are no longer
// reachable from this one
go.p = null; // memory optimization
return Stream.<List<Glyph>>builder().add(tokenGlyphs).build();
} else {
// still inside a run (or very first glyph): nothing to emit yet
return Stream.empty();
}
}).map(this::getTokenFromGlyphs);
}
private Token getTokenFromGlyphs(List<Glyph> l) {
String content = l.stream().map(Glyph::getContent).collect(Collectors.joining());
Glyph.Type type = l.get(0).getType();
......@@ -72,11 +76,11 @@ public class Tokenizer {
}
private static class Container {
final Glyph self;
Container p;
Container(Glyph self) {
this.self = self;
}
final Glyph self;
Container p;
}
}
package markov.huffman;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Packs a list of bits into a byte array and unpacks it again.
 *
 * <p>Layout: the three lowest bits of the first byte hold
 * {@code (bitCount + 3) % 8}, i.e. how many bits of the final byte are in use
 * (0 meaning the final byte is completely used). The payload follows from bit
 * 3 of the first byte onwards, least-significant bit first within each byte.
 */
public class BitConverter {

    /** Number of header bits stored at the start of the first byte. */
    private static final int HEADER_BITS = 3;

    /** Mask selecting the header bits of the first byte. */
    private static final int HEADER_MASK = 0b00000111;

    /**
     * Encodes {@code list} into the layout described on the class.
     *
     * @param list bits to encode; must not contain {@code null} elements
     * @return a new array of at least one byte (an empty list yields {@code {3}})
     */
    public byte[] toBytes(List<Boolean> list) {
        int totalBits = list.size() + HEADER_BITS;
        byte[] bytes = new byte[(totalBits + 7) / 8];
        bytes[0] = (byte) (totalBits % 8);
        int position = HEADER_BITS;
        // Iterating instead of calling list.get(i) keeps this linear even for
        // lists without fast random access.
        for (Boolean bit : list) {
            if (bit) {
                bytes[position / 8] |= 1 << (position % 8);
            }
            position++;
        }
        return bytes;
    }

    /**
     * Decodes an array produced by {@link #toBytes(List)}.
     *
     * @param array encoded bytes; an empty array decodes to an empty list
     * @return the decoded bits
     * @throws IllegalArgumentException if the header claims fewer bits than the
     *         header itself occupies
     */
    public List<Boolean> toBits(byte[] array) {
        int arrayLength = array.length;
        if (arrayLength == 0) {
            return Collections.emptyList();
        }
        int bitsInLastByte = array[0] & HEADER_MASK;
        int usedBits = bitsInLastByte == 0
                ? 8 * arrayLength
                : 8 * (arrayLength - 1) + bitsInLastByte;
        int listSize = usedBits - HEADER_BITS;
        // Only a single byte with header 1 or 2 is impossible. Header 3 in a
        // single byte is the valid encoding of an empty list and must decode,
        // otherwise toBits(toBytes(emptyList)) would fail.
        if (listSize < 0) {
            throw new IllegalArgumentException("corrupted data");
        }
        return toListPrimitive(listSize, array);
    }

    /** Reads {@code listSize} payload bits from {@code array}, starting after the header. */
    private static List<Boolean> toListPrimitive(int listSize, byte[] array) {
        List<Boolean> list = new ArrayList<>(listSize);
        for (int i = HEADER_BITS; i < listSize + HEADER_BITS; i++) {
            list.add(((array[i / 8] >> (i % 8)) & 1) == 1);
        }
        return list;
    }
}
package markov.huffman;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Supplier;
/**
 * {@link HuffmanCodeBuilder} preset whose codes are {@code List<Boolean>}
 * sequences: the root code is an empty list, {@code false} is appended for a
 * left branch and {@code true} for a right branch.
 */
public class ByteHuffmanCodeBuilder<ContentType, FrequencyType extends FrequenceType<FrequencyType>>
        extends HuffmanCodeBuilder<ContentType, List<Boolean>, Boolean, FrequencyType> {

    public ByteHuffmanCodeBuilder() {
        super(ArrayList::new,
                () -> false,
                () -> true,
                ByteHuffmanCodeBuilder::append,
                List::iterator,
                (node, takeRight) -> takeRight ? node.getRight() : node.getLeft());
    }

    /** Returns a copy of {@code prefix} extended by {@code glyph}; {@code prefix} itself stays untouched. */
    private static List<Boolean> append(List<Boolean> prefix, Boolean glyph) {
        List<Boolean> extended = new ArrayList<>(prefix);
        extended.add(glyph);
        return extended;
    }
}
package markov.huffman;
/**
 * A frequency (weight) value that can be summed and ordered.
 *
 * <p>The self-referential bound {@code X extends FrequenceType<X>} replaces the
 * former raw {@code FrequenceType} bound, so implementations are checked
 * against their own type without unchecked warnings.
 *
 * @param <X> the implementing type itself
 */
public interface FrequenceType<X extends FrequenceType<X>> extends Comparable<X> {

    /**
     * Combines this frequency with {@code b}.
     *
     * @param b the frequency to add
     * @return the combined frequency
     */
    X add(X b);

    /**
     * @return {@code true} if this frequency is greater than zero
     */
    boolean isGreaterZero();
}
package markov.huffman;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import java.util.function.Function;
/**
 * A finished code: a symbol-to-sequence table for encoding plus the tree that
 * is walked glyph by glyph for decoding.
 *
 * NOTE(review): the class is declared {@code Serializable} but stores the
 * supplied {@code Function}/{@code BiFunction} and the tree as fields - confirm
 * that those are serializable where instances are actually written out.
 */
public class HuffmanCode<ContentType, SequenceType, GlyphType, FrequencyType extends FrequenceType<FrequencyType>>
        implements Serializable {

    private final Map<ContentType, SequenceType> codes;
    private final HuffmanTree<ContentType, SequenceType, FrequencyType> tree;
    private final Function<SequenceType, Iterator<GlyphType>> splitter;
    private final BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>,
            GlyphType, HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider;

    /**
     * @param codes            code sequence per symbol, used by {@link #encode}
     * @param tree             root of the tree walked by {@link #decode}
     * @param splitter         turns a sequence into its individual glyphs
     * @param leftRightDecider picks the child of a node that a glyph leads to
     */
    public HuffmanCode(Map<ContentType, SequenceType> codes,
                       HuffmanTree<ContentType, SequenceType, FrequencyType> tree,
                       Function<SequenceType, Iterator<GlyphType>> splitter,
                       BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>,
                               GlyphType,
                               HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider) {
        this.codes = codes;
        this.tree = tree;
        this.splitter = splitter;
        this.leftRightDecider = leftRightDecider;
    }

    /**
     * Looks up the code stored for {@code t}.
     *
     * @return whatever the code table's {@code get} returns for {@code t}
     */
    public SequenceType encode(ContentType t) {
        return this.codes.get(t);
    }

    /**
     * Decodes a glyph sequence into the symbols it spells out.
     *
     * <p>Starting at the root, each glyph selects a child; whenever a leaf is
     * reached its value is appended and the walk restarts at the root. Glyphs
     * left over at the end that do not reach a leaf contribute nothing.
     */
    public List<ContentType> decode(SequenceType v) {
        List<ContentType> decoded = new ArrayList<>();
        HuffmanTree<ContentType, SequenceType, FrequencyType> position = tree;
        for (Iterator<GlyphType> glyphs = splitter.apply(v); glyphs.hasNext(); ) {
            GlyphType step = glyphs.next();
            HuffmanNode<ContentType, SequenceType, FrequencyType> inner =
                    (HuffmanNode<ContentType, SequenceType, FrequencyType>) position;
            position = leftRightDecider.apply(inner, step);
            if (position instanceof HuffmanLeaf) {
                HuffmanLeaf<ContentType, SequenceType, FrequencyType> leaf =
                        (HuffmanLeaf<ContentType, SequenceType, FrequencyType>) position;
                decoded.add(leaf.getValue());
                position = tree;
            }
        }
        return decoded;
    }
}
\ No newline at end of file
package markov.huffman;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Supplier;
/**
 * Builds {@link HuffmanCode} instances that code a given ContentType into a
 * SequenceType and decode a SequenceType back into a list of ContentTypes,
 * using the given FrequencyType to determine the distribution.
 *
 * <p>Each {@link #generateCode(Map)} call works on its own tree and code table,
 * so one builder can be reused without earlier results being altered.
 *
 * @param <ContentType>   type of the symbols to encode
 * @param <SequenceType>  type of a complete code word
 * @param <GlyphType>     type of a single element of a code word
 * @param <FrequencyType> type used to express symbol frequencies
 */
public class HuffmanCodeBuilder<ContentType, SequenceType, GlyphType, FrequencyType extends FrequenceType<FrequencyType>> {

    private final Supplier<SequenceType> rootCodeSupplier;
    private final Supplier<GlyphType> leftGlyph;
    private final Supplier<GlyphType> rightGlyph;
    private final BiFunction<SequenceType, GlyphType, SequenceType> combiner;
    private final Function<SequenceType, Iterator<GlyphType>> splitter;
    private final BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>,
            GlyphType, HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider;

    /**
     * @param rootCodeSupplier supplies the code assigned to the root of the tree
     * @param leftGlyph        supplies the glyph added for a left child
     * @param rightGlyph       supplies the glyph added for a right child
     * @param combiner         derives a child's code from its parent's code and a glyph;
     *                         the same parent code is handed to it for both children
     * @param splitter         passed on to the generated {@link HuffmanCode}
     * @param leftRightDecider passed on to the generated {@link HuffmanCode}
     */
    public HuffmanCodeBuilder(Supplier<SequenceType> rootCodeSupplier,
                              Supplier<GlyphType> leftGlyph,
                              Supplier<GlyphType> rightGlyph,
                              BiFunction<SequenceType, GlyphType, SequenceType> combiner,
                              Function<SequenceType, Iterator<GlyphType>> splitter,
                              BiFunction<HuffmanNode<ContentType, SequenceType, FrequencyType>,
                                      GlyphType,
                                      HuffmanTree<ContentType, SequenceType, FrequencyType>> leftRightDecider) {
        this.rootCodeSupplier = rootCodeSupplier;
        this.leftGlyph = leftGlyph;
        this.rightGlyph = rightGlyph;
        this.combiner = combiner;
        this.splitter = splitter;
        this.leftRightDecider = leftRightDecider;
    }

    /**
     * Generates a code for the given frequency table. Entries whose frequency is
     * not greater than zero are ignored.
     *
     * @param frequencies frequency per symbol
     * @return a code backed by a tree and code table created for this call only
     * @throws IllegalArgumentException if fewer than two symbols have a frequency
     *         greater than zero
     */
    public HuffmanCode<ContentType, SequenceType, GlyphType, FrequencyType> generateCode(
            Map<ContentType, FrequencyType> frequencies) {
        HuffmanNode<ContentType, SequenceType, FrequencyType> root = buildTree(frequencies);
        root.setCode(rootCodeSupplier.get());
        // A fresh table per call: sharing one map between calls would leak stale
        // symbols into later codes and mutate codes that were already returned.
        Map<ContentType, SequenceType> codes = new HashMap<>();
        generateCodes(root, codes);
        return new HuffmanCode<>(codes, root, splitter, leftRightDecider);
    }

    /** Repeatedly merges the two least frequent trees until a single root remains. */
    private HuffmanNode<ContentType, SequenceType, FrequencyType> buildTree(
            Map<ContentType, FrequencyType> frequencies) {
        PriorityQueue<HuffmanTree<ContentType, SequenceType, FrequencyType>> trees = new PriorityQueue<>();
        for (Map.Entry<ContentType, FrequencyType> e : frequencies.entrySet()) {
            if (e.getValue().isGreaterZero()) {
                trees.offer(new HuffmanLeaf<>(e.getValue(), e.getKey()));
            }
        }
        // With fewer than two leaves there is no inner node to return; fail with
        // a clear message instead of a ClassCastException/NullPointerException.
        if (trees.size() < 2) {
            throw new IllegalArgumentException(
                    "at least two symbols with a frequency greater than zero are required, got "
                            + trees.size());
        }
        HuffmanNode<ContentType, SequenceType, FrequencyType> root = null;
        while (trees.size() > 1) {
            HuffmanTree<ContentType, SequenceType, FrequencyType> a = trees.poll();
            HuffmanTree<ContentType, SequenceType, FrequencyType> b = trees.poll();
            root = new HuffmanNode<>(a, b);
            trees.offer(root);
        }
        // The node created last is the only element left in the queue.
        return root;
    }

    /** Assigns codes to all descendants of {@code child} and records leaf codes in {@code codes}. */
    private void generateCodes(HuffmanTree<ContentType, SequenceType, FrequencyType> child,
                               Map<ContentType, SequenceType> codes) {
        if (child == null) {
            throw new IllegalArgumentException("tree node must not be null");
        }
        if (child instanceof HuffmanLeaf) {
            HuffmanLeaf<ContentType, SequenceType, FrequencyType> leaf =
                    (HuffmanLeaf<ContentType, SequenceType, FrequencyType>) child;
            codes.put(leaf.getValue(), leaf.getCode());
        } else if (child instanceof HuffmanNode) {
            HuffmanNode<ContentType, SequenceType, FrequencyType> node =
                    (HuffmanNode<ContentType, SequenceType, FrequencyType>) child;

            // traverse left
            HuffmanTree<ContentType, SequenceType, FrequencyType> left = node.getLeft();
            left.setCode(combiner.apply(child.getCode(), leftGlyph.get()));
            generateCodes(left, codes);

            // traverse right
            HuffmanTree<ContentType, SequenceType, FrequencyType> right = node.getRight();
            right.setCode(combiner.apply(child.getCode(), rightGlyph.get()));
            generateCodes(right, codes);
        }
    }
}
package markov.huffman;
/** Leaf of the tree: carries one symbol together with its frequency. */
class HuffmanLeaf<ContentType, SequenceType, FrequencyType extends FrequenceType<FrequencyType>>
        extends HuffmanTree<ContentType, SequenceType, FrequencyType> {

    /** The symbol this leaf represents. */
    private final ContentType value;

    /**
     * @param frequency frequency of the symbol
     * @param symbol    the symbol stored in this leaf
     */
    HuffmanLeaf(FrequencyType frequency, ContentType symbol) {
        super(frequency);
        this.value = symbol;
    }

    /** @return the symbol stored in this leaf */
    ContentType getValue() {
        return this.value;
    }
}
package markov.huffman;
/**
 * Inner node of the tree. Its frequency is the result of adding the right
 * child's frequency to the left child's frequency.
 */
class HuffmanNode<ContentType, SequenceType, FrequencyType extends FrequenceType<FrequencyType>>
        extends HuffmanTree<ContentType, SequenceType, FrequencyType> {

    private final HuffmanTree<ContentType, SequenceType, FrequencyType> left;
    private final HuffmanTree<ContentType, SequenceType, FrequencyType> right;

    /**
     * @param leftChild  subtree stored as the left child
     * @param rightChild subtree stored as the right child
     */
    HuffmanNode(HuffmanTree<ContentType, SequenceType, FrequencyType> leftChild,
                HuffmanTree<ContentType, SequenceType, FrequencyType> rightChild) {
        super(leftChild.getFrequency().add(rightChild.getFrequency()));
        this.left = leftChild;
        this.right = rightChild;
    }

    /** @return the left child */
    HuffmanTree<ContentType, SequenceType, FrequencyType> getLeft() {
        return this.left;
    }

    /** @return the right child */
    HuffmanTree<ContentType, SequenceType, FrequencyType> getRight() {
        return this.right;
    }
}
package markov.huffman;
/**
 * Common base of leaves and inner nodes: holds a frequency and the code that
 * has been assigned to the node. Trees are ordered by their frequency.
 */
abstract class HuffmanTree<ContentType, SequenceType, FrequencyType extends FrequenceType<FrequencyType>>
        implements Comparable<HuffmanTree<ContentType, SequenceType, FrequencyType>> {

    private final FrequencyType frequency;

    /** Code of this node; stays {@code null} until {@link #setCode} is called. */
    private SequenceType code;

    HuffmanTree(FrequencyType nodeFrequency) {
        this.frequency = nodeFrequency;
    }

    /** Compares two trees by their frequency only. */
    @Override
    public int compareTo(HuffmanTree<ContentType, SequenceType, FrequencyType> o) {
        return frequency.compareTo(o.frequency);
    }

    FrequencyType getFrequency() {
        return this.frequency;
    }

    public SequenceType getCode() {
        return this.code;
    }

    public void setCode(SequenceType code) {
        this.code = code;
    }
}
package markov.stuff;
import markov.huffman.FrequenceType;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;
public class SimpleCountMap<T> extends HashMap<T, SimpleCountMap.MutableInt> implements CountMap<T> {
/**
 * Mutable {@code int} counter that can also serve as a Huffman frequency.
 */
public static class MutableInt implements FrequenceType<MutableInt>, Serializable {

    // always assigned by the only constructor
    private int value;

    /** @param value the initial count */
    public MutableInt(int value) {
        this.value = value;
    }

    /** @return the current count */
    public int get() {
        return this.value;
    }

    /** Raises the count by one. */
    public void increment() {
        this.value += 1;
    }

    /** @return a new counter holding the sum of both counts; neither operand is changed */
    @Override
    public MutableInt add(MutableInt b) {
        int total = get() + b.get();
        return new MutableInt(total);
    }

    /** @return {@code true} if the current count is greater than zero */
    @Override
    public boolean isGreaterZero() {
        return 0 < get();
    }

    /** Orders counters by their current count. */
    @Override
    public int compareTo(MutableInt o) {
        int mine = get();
        int theirs = o.get();
        return Integer.compare(mine, theirs);
    }
}
public class SimpleCountMap<T> extends HashMap<T, SimpleCountMap.MutableInt>
implements CountMap<T> {
/**
 * Creates an empty count map; both arguments are passed unchanged to the
 * {@code HashMap} constructor.
 *
 * @param initialCapacity initial capacity handed to {@code HashMap}
 * @param loadFactor      load factor handed to {@code HashMap}
 */
public SimpleCountMap(int initialCapacity, float loadFactor) {
super(initialCapacity, loadFactor);
}
/**
 * Creates an empty count map; the argument is passed unchanged to the
 * {@code HashMap} constructor.
 *
 * @param initialCapacity initial capacity handed to {@code HashMap}
 */
public SimpleCountMap(int initialCapacity) {
super(initialCapacity);
}
......@@ -66,11 +35,49 @@ public class SimpleCountMap<T> extends HashMap<T, SimpleCountMap.MutableInt> imp
@Override
public Stream<Map.Entry<T, MutableInt>> result() {
return entrySet().stream().sorted((e1, e2) -> Integer.compare(e2.getValue().get(), e1.getValue().get()));
return entrySet().stream().sorted((e1, e2) -> Integer.compare(e2.getValue().get(),
e1.getValue().get()));
}
/**
 * Exposes the counts as a map.
 *
 * @return this very instance, not a copy - changes made through the returned
 *         reference change this count map
 */
@Override
public HashMap<T, MutableInt> asMap() {
return this;
}
/**
 * Mutable {@code int} counter used as the map's value type.
 */
public static class MutableInt implements Serializable {

    // always assigned by the only constructor
    private int value;

    /** @param value the initial count */
    public MutableInt(int value) {
        this.value = value;
    }

    /** @return the current count */
    public int get() {
        return this.value;
    }

    /** Raises the count by one. */
    public void increment() {
        this.value += 1;
    }

    /**
     * Adapter that lets the external Huffman library add, compare and test
     * {@code MutableInt} frequencies.
     */
    public static class FrequencySupport
            implements directory.passive.huffman.FrequencySupport<MutableInt> {

        /** @return a new counter holding the sum of both counts; neither operand is changed */
        @Override
        public MutableInt add(MutableInt first, MutableInt second) {
            int total = first.get() + second.get();
            return new MutableInt(total);
        }

        /** Orders counters by their current count. */
        @Override
        public int compare(MutableInt first, MutableInt second) {
            int left = first.get();
            int right = second.get();
            return Integer.compare(left, right);
        }

        /** @return {@code true} if the count is greater than zero */
        @Override
        public boolean isGreaterZero(MutableInt candidate) {
            return 0 < candidate.get();
        }
    }
}
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment