001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.Reader;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.Charset;
027import java.nio.charset.CharsetEncoder;
028import java.nio.charset.CoderResult;
029import java.nio.charset.CodingErrorAction;
030import java.util.Objects;
031
032import org.apache.commons.io.Charsets;
033import org.apache.commons.io.IOUtils;
034import org.apache.commons.io.build.AbstractStreamBuilder;
035import org.apache.commons.io.charset.CharsetEncoders;
036
037/**
038 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
039 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
040 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
041 * <p>
042 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
043 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
044 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
045 * {@link java.io.BufferedReader}.
046 * </p>
047 * <p>
048 * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2}
049 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
050 * </p>
051 *
052 * <pre>
053 * InputStream inputStream = ...
054 * Charset cs = ...
055 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
056 * ReaderInputStream in2 = new ReaderInputStream(reader, cs);
057 * </pre>
058 * <p>
059 * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes
060 * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
061 * pulls it from the underlying stream.
062 * </p>
063 * <p>
064 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
065 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
066 * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
067 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
068 * </p>
069 * <p>
070 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
071 * </p>
072 * <p>
073 * Instances of {@link ReaderInputStream} are not thread safe.
074 * </p>
075 *
076 * @see org.apache.commons.io.output.WriterOutputStream
077 * @since 2.0
078 */
079public class ReaderInputStream extends InputStream {
080
081    /**
082     * Builds a new {@link ReaderInputStream} instance.
083     * <p>
084     * For example:
085     * </p>
086     * <pre>{@code
087     * ReaderInputStream s = ReaderInputStream.builder()
088     *   .setPath(path)
089     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
090     *   .get()}
091     * </pre>
092     * <p>
093     * @since 2.12.0
094     */
095    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
096
097        private CharsetEncoder charsetEncoder = super.getCharset().newEncoder();
098
099        /**
100         * Constructs a new instance.
101         *
102         * @throws UnsupportedOperationException if the origin cannot be converted to a Reader.
103         */
104        @SuppressWarnings("resource")
105        @Override
106        public ReaderInputStream get() throws IOException {
107            return new ReaderInputStream(getOrigin().getReader(getCharset()), charsetEncoder, getBufferSize());
108        }
109
110        @Override
111        public Builder setCharset(final Charset charset) {
112            charsetEncoder = charset.newEncoder();
113            return super.setCharset(charset);
114        }
115
116        /**
117         * Sets the charset encoder.
118         *
119         * @param charsetEncoder the charset encoder.
120         * @return this
121         */
122        public Builder setCharsetEncoder(final CharsetEncoder charsetEncoder) {
123            this.charsetEncoder = charsetEncoder;
124            super.setCharset(charsetEncoder.charset());
125            return asThis();
126        }
127
128    }
129
130    /**
131     * Constructs a new {@link Builder}.
132     *
133     * @return a new {@link Builder}.
134     * @since 2.12.0
135     */
136    public static Builder builder() {
137        return new Builder();
138    }
139
140    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
141        final float minRequired = minBufferSize(charsetEncoder);
142        if (bufferSize < minRequired) {
143            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
144                    charsetEncoder.charset().displayName()));
145        }
146        return bufferSize;
147    }
148
149    static float minBufferSize(final CharsetEncoder charsetEncoder) {
150        return charsetEncoder.maxBytesPerChar() * 2;
151    }
152
153    private final Reader reader;
154
155    private final CharsetEncoder charsetEncoder;
156
157    /**
     * CharBuffer used as input for the encoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
159     */
160    private final CharBuffer encoderIn;
161    /**
     * ByteBuffer used as output for the encoder. This buffer can be small as it is only used to transfer data from the encoder to the buffer provided by the
     * caller.
164     */
165    private final ByteBuffer encoderOut;
166
167    private CoderResult lastCoderResult;
168
169    private boolean endOfInput;
170
171    /**
172     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of
173     * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
174     *
175     * @param reader the target {@link Reader}
176     * @deprecated Use {@link ReaderInputStream#builder()} instead
177     */
178    @Deprecated
179    public ReaderInputStream(final Reader reader) {
180        this(reader, Charset.defaultCharset());
181    }
182
183    /**
184     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
185     *
186     * <p>
187     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
188     * </p>
189     *
190     * @param reader  the target {@link Reader}
191     * @param charset the charset encoding
192     * @deprecated Use {@link ReaderInputStream#builder()} instead
193     */
194    @Deprecated
195    public ReaderInputStream(final Reader reader, final Charset charset) {
196        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
197    }
198
199    /**
200     * Constructs a new {@link ReaderInputStream}.
201     *
202     * <p>
203     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
204     * </p>
205     *
206     * @param reader     the target {@link Reader}.
207     * @param charset    the charset encoding.
208     * @param bufferSize the size of the input buffer in number of characters.
209     * @deprecated Use {@link ReaderInputStream#builder()} instead
210     */
211    @Deprecated
212    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
213        // @formatter:off
214        this(reader,
215            Charsets.toCharset(charset).newEncoder()
216                    .onMalformedInput(CodingErrorAction.REPLACE)
217                    .onUnmappableCharacter(CodingErrorAction.REPLACE),
218             bufferSize);
219        // @formatter:on
220    }
221
222    /**
223     * Constructs a new {@link ReaderInputStream}.
224     *
225     * <p>
226     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
227     * an encoder which had already been in use.
228     * </p>
229     *
230     * @param reader         the target {@link Reader}
231     * @param charsetEncoder the charset encoder
232     * @since 2.1
233     * @deprecated Use {@link ReaderInputStream#builder()} instead
234     */
235    @Deprecated
236    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
237        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
238    }
239
240    /**
241     * Constructs a new {@link ReaderInputStream}.
242     *
243     * <p>
244     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
245     * an encoder which had already been in use.
246     * </p>
247     *
248     * @param reader         the target {@link Reader}
249     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
250     * @param bufferSize     the size of the input buffer in number of characters
251     * @since 2.1
252     * @deprecated Use {@link ReaderInputStream#builder()} instead
253     */
254    @Deprecated
255    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
256        this.reader = reader;
257        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
258        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
259        this.encoderIn.flip();
260        this.encoderOut = ByteBuffer.allocate(128);
261        this.encoderOut.flip();
262    }
263
264    /**
265     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
266     *
267     * <p>
268     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
269     * </p>
270     *
271     * @param reader      the target {@link Reader}
272     * @param charsetName the name of the charset encoding
273     * @deprecated Use {@link ReaderInputStream#builder()} instead
274     */
275    @Deprecated
276    public ReaderInputStream(final Reader reader, final String charsetName) {
277        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
278    }
279
280    /**
281     * Constructs a new {@link ReaderInputStream}.
282     *
283     * <p>
284     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
285     * </p>
286     *
287     * @param reader      the target {@link Reader}
288     * @param charsetName the name of the charset encoding, null maps to the default Charset.
289     * @param bufferSize  the size of the input buffer in number of characters
290     * @deprecated Use {@link ReaderInputStream#builder()} instead
291     */
292    @Deprecated
293    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
294        this(reader, Charsets.toCharset(charsetName), bufferSize);
295    }
296
297    /**
298     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
299     *
300     * @throws IOException if an I/O error occurs.
301     */
302    @Override
303    public void close() throws IOException {
304        reader.close();
305    }
306
307    /**
308     * Fills the internal char buffer from the reader.
309     *
310     * @throws IOException If an I/O error occurs
311     */
312    private void fillBuffer() throws IOException {
313        if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
314            encoderIn.compact();
315            final int position = encoderIn.position();
316            // We don't use Reader#read(CharBuffer) here because it is more efficient
317            // to write directly to the underlying char array (the default implementation
318            // copies data to a temporary char array).
319            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
320            if (c == EOF) {
321                endOfInput = true;
322            } else {
323                encoderIn.position(position + c);
324            }
325            encoderIn.flip();
326        }
327        encoderOut.compact();
328        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
329        if (endOfInput) {
330            lastCoderResult = charsetEncoder.flush(encoderOut);
331        }
332        if (lastCoderResult.isError()) {
333            lastCoderResult.throwException();
334        }
335        encoderOut.flip();
336    }
337
338    /**
339     * Gets the CharsetEncoder.
340     *
341     * @return the CharsetEncoder.
342     */
343    CharsetEncoder getCharsetEncoder() {
344        return charsetEncoder;
345    }
346
347    /**
348     * Reads a single byte.
349     *
350     * @return either the byte read or {@code -1} if the end of the stream has been reached
351     * @throws IOException if an I/O error occurs.
352     */
353    @Override
354    public int read() throws IOException {
355        for (;;) {
356            if (encoderOut.hasRemaining()) {
357                return encoderOut.get() & 0xFF;
358            }
359            fillBuffer();
360            if (endOfInput && !encoderOut.hasRemaining()) {
361                return EOF;
362            }
363        }
364    }
365
366    /**
367     * Reads the specified number of bytes into an array.
368     *
369     * @param b the byte array to read into
370     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
371     * @throws IOException if an I/O error occurs.
372     */
373    @Override
374    public int read(final byte[] b) throws IOException {
375        return read(b, 0, b.length);
376    }
377
378    /**
379     * Reads the specified number of bytes into an array.
380     *
381     * @param array the byte array to read into
382     * @param off   the offset to start reading bytes into
383     * @param len   the number of bytes to read
384     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
385     * @throws IOException if an I/O error occurs.
386     */
387    @Override
388    public int read(final byte[] array, int off, int len) throws IOException {
389        Objects.requireNonNull(array, "array");
390        if (len < 0 || off < 0 || off + len > array.length) {
391            throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
392        }
393        int read = 0;
394        if (len == 0) {
395            return 0; // Always return 0 if len == 0
396        }
397        while (len > 0) {
398            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
399                final int c = Math.min(encoderOut.remaining(), len);
400                encoderOut.get(array, off, c);
401                off += c;
402                len -= c;
403                read += c;
404            } else if (endOfInput) { // Already reach EOF in the last read
405                break;
406            } else { // Read again
407                fillBuffer();
408            }
409        }
410        return read == 0 && endOfInput ? EOF : read;
411    }
412}