Apologies in advance: for the time being I am leaving the material below in a rough state that does not yet read well as an article ...
To parse strings, we create two classes: Utf32Iterator and IteratorInput. Surrogate pairs are taken into account as well.
StringParser.java
public class StringParser {
	/**
	 * Iterator that walks a string one Unicode code point at a time, so a
	 * surrogate pair is delivered as a single value.
	 *
	 * <p>After the last code point the iterator yields exactly one {@code null}
	 * as an end-of-input marker; only after that marker has been returned does
	 * {@link #hasNext()} report {@code false}.
	 */
	public static class Utf32Iterator implements Iterator<Integer> {
		private final String source;
		/**
		 * UTF-16 index of the next unread code point. Equals {@code source.length()}
		 * when only the end-of-input marker is left, and {@code source.length() + 1}
		 * once that marker has been handed out.
		 */
		private int position = 0;

		/**
		 * @param source_ the text to iterate over; must not be {@code null}
		 * @throws NullPointerException if {@code source_} is {@code null}
		 */
		public Utf32Iterator(String source_) {
			if (source_ == null) {
				throw new NullPointerException("source_");
			}
			source = source_;
		}

		/**
		 * Reports whether a code point or the end-of-input marker is still pending.
		 * This method has no side effects.
		 */
		@Override public boolean hasNext() {
			return position <= source.length();
		}

		/**
		 * Returns the next code point and advances past it, or returns {@code null}
		 * once when the end of the string is reached.
		 *
		 * @return the next code point, or {@code null} as the end-of-input marker
		 * @throws java.util.NoSuchElementException if the end-of-input marker has
		 *         already been returned
		 */
		@Override public Integer next() {
			final int length = source.length();
			if (position > length) {
				throw new java.util.NoSuchElementException("end of input already reached");
			}
			if (position == length) {
				// Hand out the end-of-input marker exactly once.
				position++;
				return null;
			}
			// Always decode and advance: every call consumes one code point,
			// regardless of whether hasNext() was called in between.
			final int codePoint = source.codePointAt(position);
			position += Character.charCount(codePoint);
			return codePoint;
		}
	}
	/**
	 * {@link Input} implementation that feeds a parser from an {@link Iterator}.
	 *
	 * <p>Each instance pulls one element from the shared iterator when it is
	 * constructed and keeps it as its current element. The successor input is
	 * created lazily and cached, because alternatives (e.g. via {@code or}) may
	 * ask the same input for its successor more than once while the underlying
	 * iterator can only be consumed once.
	 */
	public static class IteratorInput<T> implements Input<T> {
		private final Iterator<T> iterator;
		private final int position;
		private final T current;
		/** Cached successor; {@code null} until {@link #next()} first succeeds. */
		private IteratorInput<T> next = null;

		/** Creates the input for position 0 of the given iterator. */
		public IteratorInput(Iterator<T> iterator_) {
			this(iterator_, 0);
		}

		/**
		 * Creates the input for the given position, taking the current element
		 * from the iterator ({@code null} if the iterator has nothing left).
		 */
		public IteratorInput(Iterator<T> iterator_, int position_) {
			iterator = iterator_;
			position = position_;
			if (iterator.hasNext()) {
				current = iterator.next();
			} else {
				current = null;
			}
		}

		/** Returns the element read when this input was constructed. */
		@Override public T current() {
			return current;
		}

		/** Returns the position index as a decimal string. */
		@Override public String positionDescription() {
			return Integer.toString(position);
		}

		/**
		 * Returns the input that follows this one, creating it on first use.
		 *
		 * @throws EndOfInputException if no successor is cached and the iterator
		 *         reports that it has no further elements
		 */
		@Override public Input<T> next() throws EndOfInputException {
			if (next == null) {
				if (!iterator.hasNext()) {
					throw new EndOfInputException();
				}
				next = new IteratorInput<T>(iterator, position + 1);
			}
			return next;
		}
	}
	
	/**
	 * Wraps a parser that yields a list of Unicode code points so that it yields
	 * a single String instead: each code point is appended to a fresh
	 * StringBuilder, whose contents become the result.
	 */
	public static Parser<Integer, String> concat(Parser<Integer, List<Integer>> parser) {
		return apply(
			reduce(
				parser,
				() -> new StringBuilder(),
				(builder, codePoint) -> builder.appendCodePoint(codePoint)),
			builder -> builder.toString());
	}
	
	/**
	 * Wraps a parser that yields a list of strings so that it yields their
	 * concatenation: each piece is appended to a fresh StringBuilder, whose
	 * contents become the result.
	 */
	public static Parser<Integer, String> concatStr(Parser<Integer, List<String>> parser) {
		return apply(
			reduce(
				parser,
				() -> new StringBuilder(),
				(builder, piece) -> builder.append(piece)),
			builder -> builder.toString());
	}
	
	/**
	 * Parser that accepts a single code point, provided that code point occurs
	 * somewhere in {@code str}.
	 */
	public static Parser<Integer, Integer> consistsOf(String str) {
		// String.indexOf returns -1 exactly when the code point is absent.
		return satisfy(codePoint -> str.indexOf(codePoint) != -1);
	}
	
	/**
	 * Parser that consumes exactly the text {@code str} and yields it as a String.
	 *
	 * <p>One single-code-point parser is built per code point of {@code str};
	 * they are then combined with {@code lst} and {@code concat}.
	 *
	 * @param str the literal text to match
	 * @return a parser matching {@code str} code point by code point
	 */
	public static Parser<Integer, String> word(String str) {
		List<Parser<Integer, Integer>> parsers = new ArrayList<>();
		// Walk str by code point, not by UTF-16 unit: the input stream carries
		// whole code points, so a surrogate pair in str must be compared as one
		// value. Using chars() here would make any word containing a
		// supplementary character impossible to match.
		str.codePoints().forEach(expected -> parsers.add(satisfy(actual -> actual == expected)));
		return concat(lst(parsers));
	}
	
	/** Builds a String from all Unicode code points in the given array, in order. */
	public static String codePointToString(int[] codePoint) {
		final int count = codePoint.length;
		return new String(codePoint, 0, count);
	}

	/** Builds a String that holds the single given Unicode code point. */
	public static String codePointToString(int codePoint) {
		final int[] single = {codePoint};
		return codePointToString(single);
	}
	
	/**
	 * Parser that consumes input until the text {@code str} has been matched.
	 *
	 * <p>The parser is defined recursively through a ParserMemoizer: it is
	 * either {@code word(str)}, or any single code point followed by the parser
	 * itself. In the second case the code point is converted to a String and
	 * prepended to the result of the recursive parse.
	 */
	public static Parser<Integer, String> until(String str) {
		ParserMemoizer<Integer, String> self = new ParserMemoizer<>();
		self.defun(() -> or(
			word(str),
			apply(
				// satisfy(any -> true) accepts any single code point.
				seq(satisfy(any -> true), self),
				pair -> codePointToString(pair.car) + pair.cdr.car)));
		return self;
	}
}
        Recommended Posts