java - Extracting and Printing text positions -
I've been doing experiments with PDFBox, and I'm stuck on an issue that I suspect has to do with the coordinate system.
I'm extending PDFTextStripper to get the x and y of each character in a PDF page.
I started by creating an image with ImageIO, printing the text at the position received, and putting a little mark (rectangles of different colors) at the bottom of each reference I wanted, and all seemed well. But to avoid losing the style of the PDF, I wanted to just overlay the PDF and add the aforementioned marks, and the coordinates I got don't match in PDPageContentStream.
The question is how to match the PDF coordinates from PDFTextStripper -> processTextPosition with the visual coordinates.
I'm using version 1.8.11.
As discussed in the comments, here is a 1.8 version of the DrawPrintTextLocations tool, which is part of the examples collection of the 2.0 version and is based on the better-known PrintTextLocations example. Unlike the 2.0 version, this one does not show the font bounding boxes, only the text extraction sizes, which are about the height of a small glyph (a, e, etc.). These are used as a heuristic tool for text extraction. This is the cause of the "the TextPositions I'm getting are halved" effect here. If you need the bounding boxes, it is better to use 2.0 (although they may be too big). For exact sizes, you have to calculate the path of each glyph and get the bounds of that one; again, you'd need the 2.0 version for that.
public class drawprinttextlocations extends pdftextstripper { private bufferedimage image; private final string filename; static final int scale = 4; private graphics2d g2d; private final pddocument document; /** * instantiate new pdftextstripper object. * * @param document * @param filename * @throws ioexception if there error loading properties. */ public drawprinttextlocations(pddocument document, string filename) throws ioexception { this.document = document; this.filename = filename; } /** * print documents data. * * @param args command line arguments. * * @throws ioexception if there error parsing document. */ public static void main(string[] args) throws ioexception { if (args.length != 1) { usage(); } else { pddocument document = null; try { document = pddocument.load(new file(args[0])); drawprinttextlocations stripper = new drawprinttextlocations(document, args[0]); stripper.setsortbyposition(true); (int page = 0; page < document.getnumberofpages(); ++page) { stripper.strippage(page); } } { if (document != null) { document.close(); } } } } private void strippage(int page) throws ioexception { pdpage pdpage = (pdpage) document.getdocumentcatalog().getallpages().get(page); image = pdpage.converttoimage(bufferedimage.type_int_rgb, 72 * scale); pdrectangle cropbox = pdpage.getcropbox(); g2d = image.creategraphics(); g2d.setstroke(new basicstroke(0.1f)); g2d.scale(scale, scale); setstartpage(page + 1); setendpage(page + 1); writer dummy = new outputstreamwriter(new bytearrayoutputstream()); writetext(document, dummy); // beads in green g2d.setstroke(new basicstroke(0.4f)); list<pdthreadbead> pagearticles = pdpage.getthreadbeads(); (pdthreadbead bead : pagearticles) { pdrectangle r = bead.getrectangle(); generalpath p = transform(r, matrix.gettranslatinginstance(-cropbox.getlowerleftx(), cropbox.getlowerlefty())); affinetransform flip = new affinetransform(); flip.translate(0, pdpage.findcropbox().getheight()); flip.scale(1, -1); shape s = 
flip.createtransformedshape(p); g2d.setcolor(color.green); g2d.draw(s); } g2d.dispose(); string imagefilename = filename; int pt = imagefilename.lastindexof('.'); imagefilename = imagefilename.substring(0, pt) + "-marked-" + (page + 1) + ".png"; imageio.write(image, "png", new file(imagefilename)); } /** * override default functionality of pdftextstripper. */ @override protected void writestring(string string, list<textposition> textpositions) throws ioexception { (textposition text : textpositions) { system.out.println("string[" + text.getxdiradj() + "," + text.getydiradj() + " fs=" + text.getfontsize() + " xscale=" + text.getxscale() + " height=" + text.getheightdir() + " space=" + text.getwidthofspace() + " width=" + text.getwidthdiradj() + "]" + text.getcharacter()); // in red: // show rectangles "height" (not real height, used text extraction // heuristics, 1/2 of bounding box height , starts @ y=0) rectangle2d.float rect = new rectangle2d.float( text.getxdiradj(), (text.getydiradj() - text.getheightdir()), text.getwidthdiradj(), text.getheightdir()); g2d.setcolor(color.red); g2d.draw(rect); } } /** * print usage document. */ private static void usage() { system.err.println("usage: java " + drawprinttextlocations.class.getname() + " <input-pdf>"); } /** * transforms given point matrix. * * @param x x-coordinate * @param y y-coordinate */ private point2d.float transformpoint(matrix m, float x, float y) { float[][] values = m.getvalues(); float = values[0][0]; float b = values[0][1]; float c = values[1][0]; float d = values[1][1]; float e = values[2][0]; float f = values[2][2]; return new point2d.float(x * + y * c + e, x * b + y * d + f); } /** * returns path represents rectangle having been transformed given matrix. * note resulting path need not rectangular. 
*/ private generalpath transform(pdrectangle r, matrix matrix) { float x1 = r.getlowerleftx(); float y1 = r.getlowerlefty(); float x2 = r.getupperrightx(); float y2 = r.getupperrighty(); point2d.float p0 = transformpoint(matrix, x1, y1); point2d.float p1 = transformpoint(matrix, x2, y1); point2d.float p2 = transformpoint(matrix, x2, y2); point2d.float p3 = transformpoint(matrix, x1, y2); generalpath path = new generalpath(); path.moveto((float) p0.getx(), (float) p0.gety()); path.lineto((float) p1.getx(), (float) p1.gety()); path.lineto((float) p2.getx(), (float) p2.gety()); path.lineto((float) p3.getx(), (float) p3.gety()); path.closepath(); return path; } }
Comments
Post a Comment