/* Creative Commons Attribution-NonCommercial license. NO WARRANTY IS GIVEN OR IMPLIED, USE AT YOUR OWN RISK. */
 package base.text;
Best effort attempt to convert a simple HTML page into plain text. Headings become underlined text, paragraphs and breaks are converted into a single line of white space, and paragraphs are reformatted to have a maximum width of 76 characters.
public class HtmlToText {
	private static final Pattern PAGE_BODY = Pattern.compile(".*<body>(.*)</body>.*".|.|.);
	public static String convert(String html) {
		String text = html;
		Matcher pageMatch=.matcher(text);
		if(pageMatch.matches()) {;
		text = text.replaceAll("</[h|H]1>""\r\n====================================\r\n");
		text = text.replaceAll("</[h|H]2>""\r\n------------------------------------\r\n");
		text = text.replaceAll("</[h|H]3>""\r\n------------------------------------\r\n");
		text = text.replaceAll("<[b|B][r|R][^>]*>""\r\n");
		text = text.replaceAll("</[p|P]>""\r\n\r\n");
		text = text.replaceAll("<[^>]*>""");
		String[] lines = text.split("\\n");
		StringBuffer buffer = new StringBuffer();
		boolean wasBlank = true;
		for(String line : lines) {
			String item = line.trim();
			if(item.length() == 0) {
				if(wasBlank) {
else {
					wasBlank = true;
else {
				wasBlank = false;
		TextReformat reformatter = new TextReformat();
		return reformatter.format(buffer.toString(), 76);
