Skip to content

Commit

Permalink
Main.java: Specify a password to decrypt PDFs with -p.
Browse files Browse the repository at this point in the history
  • Loading branch information
kjw committed Aug 18, 2010
1 parent 459a2c9 commit 02bf718
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 24 deletions.
1 change: 1 addition & 0 deletions .classpath
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
<classpathentry kind="lib" path="lib/pdfbox-1.1.0.jar" sourcepath="lib/pdfbox-1.1.0-src.zip"/>
<classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.1.0.jar"/>
<classpathentry kind="lib" path="lib/args4j-2.0.11.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>
1 change: 1 addition & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
<zipfileset src="${lib.dir}/pdfbox-1.1.0.jar" excludes="META-INF"/>
<zipfileset src="${lib.dir}/fontbox-1.1.0.jar" excludes="META-INF"/>
<zipfileset src="${lib.dir}/commons-logging-1.1.1.jar" excludes="META-INF"/>
<zipfileset src="${lib.dir}/args4j-2.0.11.jar" excludes="META-INF"/>
<manifest>
<attribute name="Main-Class"
value="org.crossref.pdf2xml.Main"/>
Expand Down
Binary file added lib/args4j-2.0.11.jar
Binary file not shown.
78 changes: 59 additions & 19 deletions src/org/crossref/pdf2xml/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,43 @@
import java.util.ArrayList;
import java.util.List;

import javax.swing.JFrame;
import javax.swing.JScrollPane;

import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.exceptions.WrappedIOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageNode;
import org.crossref.pdf2xml.visual.PageCanvas;

import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;

public class Main {

@Option(name="-p", usage="Specify an optional decryption password.",
required=false, multiValued=false, metaVar="PASSWD")
private String password = "";

@Argument
private List<String> filenames = new ArrayList<String>();

private static TextExtractor parsePdf(File f) throws IOException {
private TextExtractor parsePdf(File f) throws IOException {
PDDocument doc = PDDocument.load(f);

if(doc.isEncrypted()) {
// Some documents are encrypted with the empty password. Try
// to decrypt with this password, or the one passed in on the
// command line (if any), and fail if we can't.
try {
doc.decrypt(password); // Defaults to the empty string.
} catch (CryptographyException e) {
throw new WrappedIOException("Can't decrypt document: ", e);
} catch (InvalidPasswordException e) {
throw new WrappedIOException("Document is encrypted: ", e);
}
}

PDDocumentCatalog docCat = doc.getDocumentCatalog();

PDPageNode root = docCat.getPages();
Expand All @@ -41,19 +64,36 @@ private static TextExtractor parsePdf(File f) throws IOException {
return te;
}

public static void main(String[] filenames) {
for (String filename : filenames) {
File inputFile = new File(filename);
TextExtractor te = null;

try {
te = parsePdf(inputFile);
System.out.println(te.toXml());
} catch (IOException e) {
System.err.println("Couldn't read file '" + inputFile +"'.");
System.exit(1);
}
}
/**
 * Converts every file named on the command line, printing the XML for
 * each one to standard output in the order the names were given.
 *
 * Processing stops at the first file that cannot be parsed: the file name
 * and the reason are written to standard error and the JVM exits with
 * status 1.
 */
private void doMain() {
    for (String filename : filenames) {
        File inputFile = new File(filename);

        try {
            TextExtractor te = parsePdf(inputFile);
            System.out.println(te.toXml());
        } catch (IOException e) {
            // parsePdf reports decryption failures as IOExceptions too, so
            // include the exception message; otherwise a wrong or missing
            // password looks the same as an unreadable file.
            System.err.println("Couldn't read file '" + inputFile + "': "
                    + e.getMessage());
            System.exit(1);
        }
    }
}

/**
 * Command-line entry point. Binds the arguments onto a new {@code Main}
 * instance with args4j (the {@code -p} option and the list of file names)
 * and then converts the named files.
 *
 * With no arguments at all, only the usage text is printed. If the
 * arguments cannot be parsed, the parser's error message and the usage
 * text are printed to standard error and the JVM exits with status 1.
 *
 * @param args the raw command-line arguments
 */
public static void main(String[] args) {
    Main m = new Main();
    CmdLineParser parser = new CmdLineParser(m);

    if (args.length == 0) {
        System.err.println("Usage: pdf2xml [options] <FILEs>");
        parser.printUsage(System.err);
        return;
    }

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        // Say what was wrong with the command line before listing the
        // options; the usage text alone gives the user no hint.
        System.err.println(e.getMessage());
        System.err.println("Usage: pdf2xml [options] <FILEs>");
        parser.printUsage(System.err);
        // Non-zero status, matching how doMain reports failures.
        System.exit(1);
        return;
    }

    m.doMain();
}

}
6 changes: 2 additions & 4 deletions src/org/crossref/pdf2xml/TextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

/**
* Extract text from a PDF document with position and style information.
* This class attempts to coalease runs of text on the page; that is,
* This class attempts to coalesce runs of text on the page; that is,
*
* @author Karl Ward
*/
Expand Down Expand Up @@ -67,9 +67,7 @@ public void processStream(PDPage aPage, PDResources resources,

protected void processTextPosition(TextPosition tp) {
PDGraphicsState gs = getGraphicsState();

Text newT = Text.newFor(tp, gs);
currentPage.addText(newT);
currentPage.addText(Text.newFor(tp, gs));
}

private void coalesceRows(Page page) {
Expand Down
6 changes: 5 additions & 1 deletion src/org/crossref/pdf2xml/data/Text.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import java.io.IOException;

import org.apache.fontbox.cmap.CMap;
import org.apache.pdfbox.encoding.Encoding;
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
Expand All @@ -10,7 +12,9 @@
import org.apache.pdfbox.util.TextPosition;

public class Text implements Comparable<Text> {
private float x, baseline, width, height, pointSize, descent, ascent, fontSize;

private float x, baseline, width, height, pointSize,
descent, ascent, fontSize;
private String run;
private PDFont font;
private PDColorState strokeColor;
Expand Down

0 comments on commit 02bf718

Please sign in to comment.