/*
 * Created on Dec 27, 2006
 *
 */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * Converts a text file of <code>field_name: data</code> lines into a
 * tab-separated spreadsheet with one column per distinct field name and one
 * row per record.
 *
 * Input format:
 * <ul>
 * <li>A field line starts with a name containing no space, followed by a colon;
 *     the rest of the line is that field's data.</li>
 * <li>Any other non-blank line continues the data of the field before it.</li>
 * <li>Blank lines are ignored.</li>
 * <li>A line whose first character is a form feed (<code>'\f'</code>) ends the
 *     current record; the rest of that line is ignored.</li>
 * </ul>
 */
public class FieldsToSpreadSheet {

    /**
     * Extracts the field name from a line.
     *
     * @param s a line of input (must not be null)
     * @return the trimmed text before the first colon, or null when the line
     *         has no colon or when a space occurs before the first colon
     */
    static String fieldNameOf(String s) {
        int colonAt = s.indexOf(':');
        if (colonAt == -1) {
            return null;
        }
        int spaceAt = s.indexOf(' ');
        if (spaceAt != -1 && spaceAt < colonAt) {
            // A space before the colon means this is ordinary text, not a field name.
            return null;
        }
        return s.substring(0, colonAt).trim();
    }

    /**
     * Extracts the data part of a field line.
     *
     * @param s a line of input (must not be null)
     * @return the trimmed text after the first colon, or null when the line
     *         has no colon
     */
    static String dataOf(String s) {
        int colonAt = s.indexOf(':');
        if (colonAt == -1) {
            return null;
        }
        return s.substring(colonAt + 1).trim();
    }

    /**
     * Reads lines until one is found that is either non-blank or an
     * end-of-record marker.
     *
     * @param r the reader to consume lines from
     * @return the line found, or null at end of input
     * @throws IOException if reading fails
     */
    static String readNonEmptyLine(BufferedReader r) throws IOException {
        String s = r.readLine();
        // trim() also strips the form feed, so the marker has to be tested
        // explicitly or a bare "\f" line would be skipped as blank.
        while (s != null && s.trim().length() == 0 && !endOfRecord(s)) {
            s = r.readLine();
        }
        return s;
    }

    /**
     * Tells whether a line is an end-of-record marker.
     *
     * @param s a line of input (must not be null)
     * @return true when the first character of the line is a form feed
     */
    private static boolean endOfRecord(String s) {
        return s.length() > 0 && s.charAt(0) == '\f';
    }

    /**
     * Removes tabs from data (to avoid confusing the tab-separated form).
     *
     * @param s the text to clean (must not be null)
     * @return the text with every tab replaced by a single space
     */
    static String tabFree(String s) {
        return s.replace('\t', ' ');
    }

    /**
     * Reads one record: a series of field_name: prefixed lines, up to the
     * next end-of-record marker or the end of input.
     *
     * A line not prefixed with a space-free field name is appended, after a
     * single space, to the data of the preceding field. Such a line appearing
     * before the first field of the record has no field to belong to and is
     * dropped. A record ends with a line containing a form feed as its first
     * character, and the rest of that line is ignored. When a field name
     * occurs more than once in a record, the last occurrence wins.
     *
     * @param r          the reader to consume lines from
     * @param fields     receives each newly seen field name, in order of first
     *                   appearance
     * @param fieldsSeen the field names already present in fields
     * @return a map from field name to tab-free data (empty when the record
     *         holds no fields), or null when the input is exhausted
     * @throws IOException if reading fails
     */
    static Map<String, String> readRecord(BufferedReader r, List<String> fields, Set<String> fieldsSeen)
            throws IOException {
        String line = readNonEmptyLine(r);
        if (line == null) {
            return null;
        }

        Map<String, String> record = new HashMap<String, String>();
        String field = null;
        StringBuilder data = new StringBuilder();

        while (line != null && !endOfRecord(line)) {
            String nextField = fieldNameOf(line);
            if (nextField != null) {
                // A new field starts here, so the previous one is complete.
                if (field != null) {
                    storeField(record, fields, fieldsSeen, field, data.toString());
                }
                field = nextField;
                data.setLength(0);
                data.append(dataOf(line));
            } else if (field != null) {
                data.append(' ').append(line.trim());
            }
            // else: text before the first field name; there is no field to attach it to.
            line = readNonEmptyLine(r);
        }

        if (field != null) {
            storeField(record, fields, fieldsSeen, field, data.toString());
        }
        return record;
    }

    /** Stores one completed field in the record and registers its name if it is new. */
    private static void storeField(Map<String, String> record, List<String> fields,
            Set<String> fieldsSeen, String field, String data) {
        if (fieldsSeen.add(field)) {
            fields.add(field);
        }
        record.put(field, tabFree(data));
    }

    /**
     * Reads every record of a file.
     *
     * @param file_name  path of the file to read
     * @param fields     receives the distinct field names in order of first
     *                   appearance
     * @param fieldsSeen the field names already present in fields
     * @return the records of the file, in file order
     * @throws Throwable in practice an IOException, when the file cannot be
     *                   opened or read
     */
    static List<Map<String, String>> readFile(String file_name, ArrayList<String> fields,
            Set<String> fieldsSeen) throws Throwable {
        List<Map<String, String>> records = new ArrayList<Map<String, String>>();
        BufferedReader r = new BufferedReader(new FileReader(file_name));
        try {
            Map<String, String> m = readRecord(r, fields, fieldsSeen);
            while (m != null) {
                records.add(m);
                m = readRecord(r, fields, fieldsSeen);
            }
        } finally {
            // Release the file handle even when reading fails part-way.
            r.close();
        }
        return records;
    }

    /**
     * Command-line entry point: reads the fields file named by the first
     * argument and writes the tab-separated result to the file named by the
     * second. Prints a usage message and exits with status 1 unless exactly
     * two arguments are given.
     *
     * @param args fields_file and tabs_file
     * @throws Throwable in practice an IOException, when reading or writing fails
     */
    public static void main(String[] args) throws Throwable {
        // java FieldsToSpreadSheet fields_file tabs_file
        if (args.length != 2) {
            System.err
                    .println("Usage: java FieldsToSpreadSheet fields_file tabs_file\n"
                            + "Reads field_name-prefixed lines from fields_file and accumulates them into a tab-separated spreadsheet in tabs_file.\n"
                            + "A field_name is a leading string containing no spaces followed by a colon. The data is everything between that and the next field name.\n"
                            + "Records are separated by lines starting with a form feed.");
            System.exit(1);
        }
        ArrayList<String> fields = new ArrayList<String>();
        HashSet<String> fieldset = new HashSet<String>();
        List<Map<String, String>> records = readFile(args[0], fields, fieldset);
        BufferedWriter result = new BufferedWriter(new FileWriter(args[1]));
        try {
            writeRecords(result, records, fields);
        } finally {
            // Flushes buffered output and releases the file even when writing fails.
            result.close();
        }
    }

    /**
     * Writes a header row of field names followed by one row per record.
     * Columns are separated by tabs; a field missing from a record is written
     * as an empty cell.
     *
     * @param result the writer to emit rows to (not closed by this method)
     * @param a      the records to write, one per row
     * @param fields the column order
     * @throws IOException if writing fails
     */
    private static void writeRecords(BufferedWriter result, List<Map<String, String>> a,
            ArrayList<String> fields) throws IOException {
        String sep = "";
        for (String f : fields) {
            result.write(sep);
            // A tab inside a field name would split the header into extra columns.
            result.write(tabFree(f));
            sep = "\t";
        }
        result.newLine();

        for (Map<String, String> m : a) {
            sep = "";
            for (String f : fields) {
                result.write(sep);
                String cell = m.get(f);
                if (cell == null) {
                    cell = "";
                }
                result.write(cell.trim());
                sep = "\t";
            }
            result.newLine();
        }
    }
}