Java - XLSX parse & database export - java

I have an Excel file with about 50k-60k rows.
I have to upload its content into MySQL. I usually use Apache POI to read a workbook and upload it into MySQL, but this file cannot be read with Apache POI because it is too large.
Can anybody guide me on how to do that? Here is my sample code that uploads the content into MySQL using Apache POI (it works for small xlsx files that contain 1000-2000 rows):
/**
 * Walks every row of the already opened sheet {@code sheetx} and stores each
 * qualifying row in the table {@code dbtable}.
 *
 * A row qualifies when the numeric value of its first cell, truncated to an
 * int and rendered as text, consists of exactly three letters or digits. For
 * such a row the next five cells are read as strings and an INSERT statement
 * is assembled and handed to {@code save}.
 *
 * NOTE(review): the statement is assembled by string concatenation from cell
 * contents, so a quote inside a cell changes the SQL - confirm whether
 * {@code save} can accept bound parameters instead.
 *
 * @param file    uploaded file; only its name is printed here
 * @param dbtable name of the target table, inserted verbatim into the SQL
 */
public static void uploadCrossSellCorpCard(FileItem file, String dbtable) {
    System.out.println("UploadUtil Running" + file.getFileName().toString());
    try {
        for (int rowIndex = 0; rowIndex <= sheetx.getLastRowNum(); rowIndex++) {
            row = sheetx.getRow(rowIndex);
            try {
                int oc = (int) row.getCell(0).getNumericCellValue();
                boolean threeCharCode = String.valueOf(oc).matches("[A-Za-z0-9]{3}");
                if (!threeCharCode) {
                    // Not a data row: move on to the next one.
                    continue;
                }
                StringBuilder insert = new StringBuilder("INSERT INTO ");
                insert.append(dbtable).append(" VALUES('").append(oc);
                // Cells 1..5: rm name, company, product, detail and type, in that order.
                for (int column = 1; column <= 5; column++) {
                    insert.append("','").append(row.getCell(column).getStringCellValue());
                }
                insert.append("')");
                save(insert.toString());
                System.out.println("Import rows " + rowIndex);
            } catch (IllegalStateException e) {
                // Raised by POI when a cell does not hold the requested type.
                e.printStackTrace();
            } catch (NullPointerException e) {
                // Missing row or missing cell: report it and keep going.
                System.out.println(e);
            }
        }
        System.out.println("Success import xlsx to mysql table");
    } catch (NullPointerException e) {
        // Reached when the sheet itself is not available.
        System.out.println(e);
        System.out.println("Select the file first before uploading");
    }
}
Note: I use a Hibernate method to handle the upload; "save(sql)" calls my Hibernate method.

You can try using Apache POI SAX - read the section --> XSSF and SAX (Event API) on https://poi.apache.org/spreadsheet/how-to.html
You can read an entire Excel file with 60k rows, or even 100k rows, just like reading an XML file. The only thing you need to take care of is empty cells: the XML simply contains no tag for an empty cell, so the cell is skipped, but you will probably want to store a null value in the database column that corresponds to it.
Solution --> read each row and fire an insert statement in a loop. Watch for empty cells by monitoring the cell address: if a gap occurs, work out which column is missing and put a null value into the insert statement for it.
I hope this helps you. The sample code below reads the Excel file and stores it in an ArrayList of ArrayLists as a tabular representation. It prints "New row begins" to the console before it starts reading a row, and it prints the cell address of each value before printing the cell value itself.
I have not handled the gaps left by empty cells, because in my case there are no empty cells, but you can add that by detecting a gap in the cell addresses.
Look at the cell addresses in the console output; they help you spot any gap so that you can handle it as you wish.
I ran this code and it works fine for me. Don't forget to add xmlbeans-2.3.0.jar
in addition to the jars required by the import statements.
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**
 * Reads one worksheet of an .xlsx workbook through Apache POI's SAX based
 * event API and collects the cell values row by row.
 *
 * The collected data lives in static fields, so it is shared by all instances
 * and keeps growing when more than one sheet is processed.
 */
public class ExcelToStringArray implements Cloneable {
    /** All rows read so far; each row starts with its running row counter. */
    public static ArrayList<ArrayList<StringBuilder>> stringArrayToReturn = new ArrayList<ArrayList<StringBuilder>>();
    /** The row that is currently being filled by the SAX handler. */
    public static ArrayList<StringBuilder> retainedString;
    /** Running counter written as the first entry of every row. */
    public static Integer lineCounter = 0;

    /**
     * Parses the given sheet and returns the accumulated rows.
     *
     * @param PathtoFilename path of the .xlsx file
     * @param rId            relationship id of the sheet inside the workbook, e.g. "rId1"
     * @return the shared list of rows (the same object as {@link #stringArrayToReturn})
     * @throws Exception if the file cannot be opened or parsed
     */
    public ArrayList<ArrayList<StringBuilder>> GetSheetInStringArray(String PathtoFilename, String rId)
            throws Exception {
        processOneSheet(PathtoFilename, rId);
        return stringArrayToReturn;
    }

    /**
     * Opens the workbook, streams the requested sheet through the SAX handler
     * and releases the workbook again, also when parsing fails.
     *
     * @param PathtoFilename path of the .xlsx file
     * @param rId            relationship id of the sheet
     * @throws Exception if the file cannot be opened or parsed
     */
    public void processOneSheet(String PathtoFilename, String rId) throws Exception {
        OPCPackage pkg = OPCPackage.open(PathtoFilename);
        try {
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            XMLReader parser = fetchSheetParser(sst);
            InputStream sheet = r.getSheet(rId);
            try {
                parser.parse(new InputSource(sheet));
            } finally {
                sheet.close();
            }
        } finally {
            // The workbook is only read here: revert() closes the package
            // without writing anything back to the file.
            pkg.revert();
        }
    }

    /**
     * Creates the XML reader whose content handler collects the cell values.
     *
     * @param sst shared strings table of the workbook
     * @return a reader ready to parse one sheet stream
     * @throws SAXException if the Xerces parser class cannot be instantiated
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        ContentHandler handler = new SheetHandler(sst);
        parser.setContentHandler(handler);
        return parser;
    }

    /** SAX handler that turns row, c (cell) and v (value) elements into list entries. */
    private static class SheetHandler extends DefaultHandler {
        private final SharedStringsTable sst;
        /** Text collected since the last start tag; characters() may deliver it in pieces. */
        private final StringBuilder lastContents = new StringBuilder();
        /** True while the current cell's value is an index into the shared strings table. */
        private boolean nextIsString;

        private SheetHandler(SharedStringsTable sst) {
            this.sst = sst;
        }

        @Override
        public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
            if (name.equals("row")) {
                // Start a fresh row and register it right away; the cells are added as they arrive.
                retainedString = new ArrayList<StringBuilder>();
                stringArrayToReturn.add(retainedString);
                System.out.println("New row begins");
                retainedString.add(new StringBuilder(lineCounter.toString()));
                lineCounter++;
            }
            // c => cell
            if (name.equals("c")) {
                // Print the cell reference
                System.out.print(attributes.getValue("r") + " - ");
                // Cell type "s" means the value is an index into the shared strings table
                String cellType = attributes.getValue("t");
                nextIsString = cellType != null && cellType.equals("s");
            }
            // Clear contents cache
            lastContents.setLength(0);
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            // v => contents of a cell. The value is handled at the end tag
            // because characters() may be called more than once.
            if (name.equals("v")) {
                String value = lastContents.toString();
                if (nextIsString) {
                    // Only a v element carries the index, so the lookup is done
                    // here and not for every end tag.
                    int idx = Integer.parseInt(value);
                    value = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
                    nextIsString = false;
                }
                System.out.println(value);
                // value of the cell, whether it is a string or a number
                retainedString.add(new StringBuilder(value));
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            lastContents.append(ch, start, length);
        }
    }

    public static void main(String[] args) throws Exception {
        StopWatch watch = new StopWatch();
        watch.start();
        ExcelToStringArray generate = new ExcelToStringArray();
        // rId1 is the first sheet in my workbook, rId2 the second sheet and so on.
        generate.GetSheetInStringArray("D:\\Users\\NIA\\Desktop\\0000_MasterTestSuite.xlsx", "rId10");
        watch.stop();
        System.out.println(DurationFormatUtils.formatDurationWords(watch.getTime(), true, true));
        System.out.println("done");
        System.out.println(ExcelToStringArray.stringArrayToReturn);
    }
}

Related

I am trying to use a url xml parse but it looks like i keep getting an empty xml

I am trying to get info from a weather API. When I make the request I get a response, but when I try to extract only a specific part of the response I get a null value every time. Can someone help? Here is the code for my handler:
package weathercalls;
import java.util.ArrayList;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that fills three lists from the start tags of the elements
 * "region", "wind_degree" and "high".
 *
 * Only the attributes of a start tag are inspected here; the text between a
 * start tag and its end tag is never read by this class.
 */
public class Handler extends DefaultHandler
{
    // Create three array lists to store the data
    public ArrayList<Integer> lows = new ArrayList<Integer>();
    public ArrayList<Integer> highs = new ArrayList<Integer>();
    public ArrayList<String> regions = new ArrayList<String>();

    // Make sure that the code in DefaultHandler's
    // constructor is called:
    public Handler()
    {
        super();
    }

    /*** Below are the three methods that we are extending ***/
    @Override
    public void startDocument()
    {
        System.out.println("Start document");
    }

    @Override
    public void endDocument()
    {
        System.out.println("End document");
    }

    /**
     * Called for every start tag.
     *
     * @param uri   namespace URI of the element
     * @param name  local name of the element
     * @param qName qualified name of the element; this is what gets compared
     * @param atts  attributes written on the start tag
     */
    @Override
    public void startElement(String uri, String name, String qName, Attributes atts)
    {
        if (qName.equals("region"))
        {
            // getLocalName(0) is the NAME of the first attribute and is null
            // when the tag carries no attribute at all.
            String region = atts.getLocalName(0);
            System.out.println("Day: " + region);
            this.regions.add(region);
        }
        if (qName.equalsIgnoreCase("wind_degree"))
        {
            // getLength() is the number of attributes on the tag.
            int low = atts.getLength();
            System.out.println("Low: " + low);
            this.lows.add(low);
        }
        if (qName.equalsIgnoreCase("high"))
        {
            // getValue(0) is null for a tag without attributes; parsing null
            // would throw, so such a tag is skipped.
            String firstValue = atts.getValue(0);
            if (firstValue != null)
            {
                int high = Integer.parseInt(firstValue);
                System.out.println("High: " + high);
                this.highs.add(high);
            }
        }
    }
}
and here is my main file code :
package weathercalls;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
/**
 * Downloads the current weather for one location as XML and feeds the
 * response to {@code Handler} through a SAX parser.
 */
public class weatherCalls {
    /**
     * Builds the request URL, parses the response and prints a few diagnostic
     * objects.
     *
     * @param args not used
     * @throws Exception if the SAX parser cannot be created
     */
    public static void main(String[] args) throws Exception {
        //Main url
        String main_url = "http://api.weatherapi.com/v1/";
        //Live or Weekly forecast
        String live_weather = "current.xml?key=";
        //String sevendays_weather = "orecast.xml?key=";
        //API Key + q
        // NOTE(review): the key is hard-coded in the source - consider loading it from configuration.
        String API_Key = "c2e285e55db74def97f151114201701&q=";
        //Location Setters
        String location = "London";
        /*
        URL weather = new URL(main_url + live_weather + API_Key + location);
        URLConnection yc = weather.openConnection();
        BufferedReader in1 = new BufferedReader(
        new InputStreamReader(
        yc.getInputStream()));
        String inputLine;
        while ((inputLine = in1.readLine()) != null)
        System.out.println(inputLine);
        in1.close();
        */
        try
        {
            // Turn the string into a URL object
            String complete_url = main_url + live_weather + API_Key + location;
            URL urlObject = new URL(complete_url);
            // Open the stream; try-with-resources closes it even when parsing fails
            try (InputStream in = urlObject.openStream())
            {
                // Create an XML reader
                SAXParserFactory parserFactory = SAXParserFactory.newInstance();
                SAXParser parser = parserFactory.newSAXParser();
                XMLReader xr = parser.getXMLReader();
                // Tell that XML reader to use our handler
                Handler ourSpecialHandler = new Handler();
                xr.setContentHandler(ourSpecialHandler);
                // Wrap the InputStream in an InputSource, which is what the reader accepts
                InputSource inSource = new InputSource(in);
                // And parse it!
                xr.parse(inSource);
                System.out.println(complete_url);
                System.out.println(urlObject);
                System.out.println(in);
                System.out.println(xr);
                System.out.println(inSource);
                System.out.println(parser);
            }
        }
        catch(IOException ioe)
        {
            ioe.printStackTrace();
        }
        catch(SAXException se)
        {
            se.printStackTrace();
        }
    }
}
and this is my console print:
Start document
Day: null
Low: 0
End document
http://api.weatherapi.com/v1/current.xml?key=c2e285e55db74def97f151114201701&q=London
http://api.weatherapi.com/v1/current.xml?key=c2e285e55db74def97f151114201701&q=London
sun.net.www.protocol.http.HttpURLConnection$HttpInputStream@2471cca7
com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl$JAXPSAXParser@5fe5c6f
org.xml.sax.InputSource@6979e8cb
com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl@763d9750
I think you are trying to extract the values from the XML tags, and if that is the case then you are doing it wrong. The Attributes object contains the attributes of a particular tag, and to get the text value of the tag you have to do some extra work. Just as there is an event for the start of a tag, there are separate events for its contents and for its end. The currentTag variable keeps track of the tag that is currently being processed. Below is sample code:
/**
 * SAX handler that collects the text content of the elements "region",
 * "wind_degree" and "high".
 *
 * The text of an element is buffered while the parser reports it and is
 * evaluated at the end tag, because the parser is allowed to deliver the text
 * of one element in several characters() calls.
 */
class Handler extends DefaultHandler {
    // Create three array lists to store the data
    public ArrayList<Integer> lows = new ArrayList<Integer>();
    public ArrayList<Integer> highs = new ArrayList<Integer>();
    public ArrayList<String> regions = new ArrayList<String>();

    //Keeps track of the current tag;
    String currentTag = "";
    // Text reported for the current tag so far
    private final StringBuilder currentText = new StringBuilder();

    // Make sure that the code in DefaultHandler's
    // constructor is called:
    public Handler() {
        super();
    }

    /*** Below are the methods that we are extending ***/
    @Override
    public void startDocument() {
        System.out.println("Start document");
    }

    @Override
    public void endDocument() {
        System.out.println("End document");
    }

    /** Remembers which tag was opened last and starts a fresh text buffer for it. */
    @Override
    public void startElement(String uri, String name, String qName, Attributes atts) {
        //Save the current tag being handled
        currentTag = qName;
        currentText.setLength(0);
    }

    /** Buffers text that belongs to the tag opened last; text outside of it is ignored. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (!currentTag.isEmpty()) {
            currentText.append(ch, start, length);
        }
    }

    /**
     * Evaluates the buffered text once the tag that was opened last is closed.
     * An element for which no text was reported adds nothing to the lists.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (qName.equals(currentTag) && currentText.length() > 0) {
            String text = currentText.toString();
            //Rules based on current tag
            switch (currentTag) {
            case "region":
                this.regions.add(text);
                System.out.println("Day: " + text);
                break;
            case "wind_degree":
                int low = Integer.parseInt(text);
                System.out.println("Low: " + low);
                this.lows.add(low);
                break;
            case "high":
                int high = Integer.parseInt(text);
                System.out.println("High: " + high);
                this.highs.add(high);
                break;
            default:
                break;
            }
        }
        //Reset it
        currentTag = "";
        currentText.setLength(0);
    }
}
NOTE: Please refrain from sharing your API keys or passwords on the internet.

How can I go back to Main method in my code, And depending on Condition?

In this program I am reading an .xlsx file and adding the cell data to a vector. If the vector size is less than 12, there is no need to read the remaining data, and I need to go back to the main method.
How can I do that in my program?
This is my Code :
package com.read;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Vector;
import org.apache.poi.openxml4j.opc.OPCPackage;
import java.io.InputStream;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**
 * Streams the sheet "rId2" of an .xlsx workbook through Apache POI's SAX
 * based event API and collects the cell values in a vector.
 */
public class SendDataToDb {
    public static void main(String[] args) {
        SendDataToDb sd = new SendDataToDb();
        try {
            sd.processOneSheet("C:/Users/User/Desktop/New folder/Untitled 2.xlsx");
            System.out.println("in Main method");
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    /**
     * Opens the workbook, parses the sheet with relationship id "rId2" and
     * releases the workbook again, also when parsing fails or is aborted.
     *
     * @param filename path of the .xlsx file
     * @throws Exception if the file cannot be opened or parsed
     */
    public void processOneSheet(String filename) throws Exception {
        System.out.println("executing Process Method");
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            XSSFReader r = new XSSFReader( pkg );
            SharedStringsTable sst = r.getSharedStringsTable();
            System.out.println("count "+sst.getCount());
            XMLReader parser = fetchSheetParser(sst);
            // To look up the Sheet Name / Sheet Order / rID,
            // you need to process the core Workbook stream.
            // Normally it's of the form rId# or rSheet#
            InputStream sheet2 = r.getSheet("rId2");
            try {
                System.out.println("Sheet2");
                InputSource sheetSource = new InputSource(sheet2);
                parser.parse(sheetSource);
            } finally {
                sheet2.close();
            }
        } finally {
            // The workbook is only read: revert() closes it without saving.
            pkg.revert();
        }
    }

    /**
     * Creates the XML reader whose content handler collects the cell values.
     *
     * @param sst shared strings table of the workbook
     * @return a reader ready to parse one sheet stream
     * @throws SAXException if the Xerces parser class cannot be instantiated
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        //System.out.println("EXECUTING fetchSheetParser METHOD");
        XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        ContentHandler handler = new SheetHandler(sst);
        parser.setContentHandler(handler);
        System.out.println("Method :fetchSheetParser");
        return parser;
    }

    /**
     * See org.xml.sax.helpers.DefaultHandler javadocs
     */
    private class SheetHandler extends DefaultHandler {
        private SharedStringsTable sst;
        private String lastContents;
        private boolean nextIsString;
        /** Number of c (cell) start tags seen so far. */
        private int columnNum;
        /** Cell values collected so far; emptied at a row end only when it holds fewer than 12 entries. */
        Vector<String> values = new Vector<String>(20);

        private SheetHandler(SharedStringsTable sst) {
            this.sst = sst;
        }

        @Override
        public void startElement(String uri, String localName, String name,
                Attributes attributes) throws SAXException {
            // c => cell
            //long l = Long.valueOf(attributes.getValue("r"));
            if(name.equals("c")) {
                columnNum++;
                // Cell type "s" means the value is an index into the shared strings table
                String cellType = attributes.getValue("t");
                nextIsString = cellType != null && cellType.equals("s");
            }
            // Clear contents cache
            lastContents = "";
        }

        @Override
        public void endElement(String uri, String localName, String name)
                throws SAXException {
            //System.out.println("Method :222222222");
            // v => contents of a cell
            // Handled at the end tag, as characters() may be called more than once
            if(name.equals("v")) {
                if(nextIsString) {
                    // Only a v element carries the index, so the lookup is done
                    // here and not for every end tag.
                    int idx = Integer.parseInt(lastContents);
                    lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
                    nextIsString = false;
                }
                values.add(lastContents);
            }
            if(name.equals("row")) {
                System.out.println(values);
                //values.setSize(50);
                System.out.println(values.size()+" "+values.capacity());
                //********************************************************
                //I AM CHECKING CONDITION HERE, IF CONDITION IS TRUE I NEED STOP THE REMAINING PROCESS AND GO TO MAIN METHOD.
                if(values.size() < 12)
                    values.removeAllElements();
                //WHAT CODE I NEED TO WRITE HERE TO STOP THE EXECUTION OF REMAINING PROCESS AND GO TO MAIN
                //***************************************************************
            }
        }

        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            //System.out.println("method : 333333333333");
            lastContents += new String(ch, start, length);
        }
    }
}
check the code in between lines of //******************************
and
//******************************************
You can throw a SAXException wherever you want the parsing to stop:
throw new SAXException("<Your message>")
and handle it in the main method.
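After your check, throw an exception to leave the handler and return control to the main method. Inside endElement this has to be a SAXException (or an unchecked exception), because the method does not declare any other checked exception.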
// endElement only declares SAXException, so a plain Exception cannot be thrown from it.
throw new SAXException("vector size is less than 12");

Batch Insert In Cassandra using Apache Spark hanging and context not getting closed when triggered from a Web Ser

I am a new to spark . I am trying to insert csv files into cassandra table using spark-cassandra connector as below:
The files are in HDFS. I get the paths of all the files, and for each path I call a method that converts the CSV data to the corresponding Cassandra data types, creates a prepared statement, binds the data to it and adds it to a batch. Finally, I execute the batch when it reaches 1000 statements.
Key Points
1. I am using Apache Cassandra 2.1.8 and Spark 1.5
2. I read the Csv files using Spark Context
3. I am using the com.datastax.spark.connector.cql.CassandraConnector to create a Session with Cassandra.
I have 9 Files , each files data goes to a table in cassandra. Every Things works fine All inserts are happening as expected and the job gets completed when I submit the jar on spark submit.
The problem I am facing is that when the same jar is invoked through a web service (the web service calls the script that invokes the jar), the data of one of the files does not get inserted and the Spark context does not stop, so the job runs forever.
When I insert 4 or 5 files, everything works fine even through the web service. But with all of them together it hangs, I get 10 records fewer in one of the tables, and the context does not stop.
It is strange, because when I submit the jar with spark-submit directly everything works fine, and I only face this issue through the web service, even though the web service submits the job through the same spark-submit.
Here is my code
package com.pz.loadtocassandra;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.ConsoleHandler;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import com.datastax.driver.core.BatchStatement;
import com.datastax.driver.core.BoundStatement;
import com.datastax.driver.core.PreparedStatement;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.exceptions.InvalidTypeException;
import com.datastax.spark.connector.cql.CassandraConnector;
import com.datastax.spark.connector.japi.CassandraRow;
import com.pz.shared.UnicodeBOMInputStream;
import com.pz.shared.fileformat.Header;
import com.pz.shared.mr.fileformat.MRFileFormats.CSVInputFormat;
import com.pz.shared.mr.fileformat.MRFileFormats.TextArrayWritable;
public class LoadToCassandra {
public static final String STUDYID = "STUDYID";
public static final String PROJECTNAME = "PROJECTNAME";
public static final String FILEID = "FILEID";
public static int count = 0;
public static final String FILE_SERPERATOR = "/";
public static Logger log = Logger.getLogger(LoadToCassandra.class.getName());
public static void main(String[] args) {
String propFileLoc = args[0];
String hdfsHome = args[1];
String hdfs_DtdXmlPath = args[2];
String hdfs_NormalizedDataPath = args[3];
run(propFileLoc, hdfsHome, hdfs_DtdXmlPath,hdfs_NormalizedDataPath);
} catch (IOException exception) {
log.log(Level.SEVERE, "Error occur in FileHandler.", exception);
}
}
/**
 * Loads every file found under {@code hdfs_NormalizedDataPath} into
 * Cassandra, one file after the other, and always stops the Spark context
 * and closes the file system at the end.
 *
 * @param propFileLoc             location of the properties file
 * @param hdfsHome                URI of the HDFS home
 * @param hdfs_DtdXmlPath         HDFS path handed to the DTD XML parser
 * @param hdfs_NormalizedDataPath HDFS directory whose entries are loaded
 */
public static void run(String propFileLoc, String hdfsHome,
        String hdfs_DtdXmlPath, String hdfs_NormalizedDataPath) {
    JavaSparkContext sparkContext = null;
    FileSystem fileSystem = null;
    try {
        PropInitialize.initailizeConfig(propFileLoc);
        // the Spark context is created before anything is read from HDFS
        sparkContext = setSparkContext(propFileLoc);
        ParseDtdXml.parseDTDXML(hdfsHome, hdfs_DtdXmlPath);
        Configuration hadoopConfiguration = setHadoopConf();
        fileSystem = getHadoopFs(hdfsHome, hadoopConfiguration);
        // one path per entry of the data directory
        Path[] csvPaths = FileUtil.stat2Paths(
                fileSystem.listStatus(new Path(hdfs_NormalizedDataPath)));
        log.info("PATH.size - " + csvPaths.length);
        for (int i = 0; i < csvPaths.length; i++) {
            Path csvPath = csvPaths[i];
            log.info("path is : " + csvPath.toString());
            loadToCassandra(propFileLoc, csvPath, hadoopConfiguration, hdfsHome, sparkContext);
        }
    } catch (IOException | URISyntaxException failure) {
        log.log(Level.SEVERE, "run method", failure);
        failure.printStackTrace();
    } finally {
        log.info("finally ");
        if (sparkContext != null) {
            sparkContext.stop();
            System.out.println("SC Stopped");
        }
        if (fileSystem != null) {
            try {
                fileSystem.close();
            } catch (IOException closeFailure) {
                log.log(Level.SEVERE, "run method", closeFailure);
            }
        }
    }
}
/**
 * Looks up the Hadoop file system for the given home URI.
 *
 * @param hdfsHome      URI of the HDFS home, as text
 * @param configuration Hadoop configuration to use
 * @return the Hadoop file system object
 * @throws IOException        if the file system cannot be obtained
 * @throws URISyntaxException if {@code hdfsHome} is not a valid URI
 */
private static FileSystem getHadoopFs(String hdfsHome,
        Configuration configuration) throws IOException, URISyntaxException {
    URI homeUri = new URI(hdfsHome);
    return FileSystem.get(homeUri, configuration);
}
/**
 * Creates the Hadoop configuration used for reading the CSV files: the
 * encoded flag is switched on and the token delimiter is a comma.
 *
 * @return the Hadoop configuration object
 */
private static Configuration setHadoopConf() throws IOException,
        URISyntaxException {
    Configuration csvConfiguration = new Configuration();
    csvConfiguration.set("csvinputformat.token.delimiter", ",");
    csvConfiguration.setBoolean("csvFileFormat.encoded.flag", true);
    return csvConfiguration;
}
/**
 * Creates the Spark context from the values found in the properties file.
 *
 * @param propFileLoc location of the properties file
 * @return a JavaSparkContext configured with serializer, application name,
 *         master, Cassandra host and external jars
 */
private static JavaSparkContext setSparkContext(String propFileLoc) {
    PropInitialize.initailizeConfig(propFileLoc);
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.setAppName("Loading Data");
    sparkConf.setMaster(PropInitialize.spark_master);
    sparkConf.set("spark.cassandra.connection.host", PropInitialize.cassandra_hostname);
    sparkConf.setJars(PropInitialize.external_jars);
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    return sparkContext;
}
private static void loadToCassandra(String propFileLoc, Path sourceFileHdfsPath,
Configuration hadoopConf, String hdfsHome,JavaSparkContext ctx) {
System.out.println("File :: " + sourceFileHdfsPath.toString());
FileSystem hadoopFs = null;
PropInitialize.initailizeConfig(propFileLoc);
String cassKeyspaceName = PropInitialize.cass_keyspace_name;
log.info("entered here for file "+sourceFileHdfsPath.toString());
final String strInputFileName = StringUtils.split(
sourceFileHdfsPath.getName(), "#")[0].toLowerCase();
final String strTableNameInCass = StringUtils.split(
sourceFileHdfsPath.getName(), "-")[0].split("#")[1]
.toLowerCase();
final String strSourceFilePath = sourceFileHdfsPath.toString();
try {
hadoopFs = getHadoopFs(hdfsHome, hadoopConf);
//getting the cassandra connection using spark conf
final CassandraConnector connector = getCassandraConnection(ctx);
final JavaRDD<CassandraRow> cassTableObj=getCassTableObj(ctx,cassKeyspaceName,strTableNameInCass);
final Map<String, List<String>> tabColMapWithColTypes1 = ParseDtdXml.tabColMapWithColTypes;
final String headersUpdated;
final String headers;
UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(
hadoopFs.open(sourceFileHdfsPath));
Header CsvHeader = Header.getCSVHeader(ubis, ",");
if (!strTableNameInCass.equalsIgnoreCase("PCMASTER")) {
String fString = "";
for (int i = 0; i < CsvHeader.size() - 1; i++) {
fString = fString + CsvHeader.get(i).ColumnName + ",";
}
fString = fString
+ CsvHeader.get(CsvHeader.size() - 1).ColumnName;
headers = fString; // StringUtils.join(stringArr.toString(),",");
headersUpdated = strTableNameInCass.toUpperCase() + "ID,"
+ headers;
} else {
String[] stringArr = new String[CsvHeader.size()];
String fString = "";
for (int i = 0; i < CsvHeader.size() - 1; i++) {
// stringArr[i] = CsvHeader.get(i).ColumnName;
fString = fString + CsvHeader.get(i).ColumnName + ",";
}
fString = fString
+ CsvHeader.get(CsvHeader.size() - 1).ColumnName;
headers = StringUtils.join(stringArr.toString(), ",");
headersUpdated = fString;
}
ubis.close();
//Reading the file using spark context
JavaPairRDD<LongWritable, TextArrayWritable> fileRdd = ctx
.newAPIHadoopFile(strSourceFilePath, CSVInputFormat.class,
LongWritable.class, TextArrayWritable.class,
hadoopConf);
final long recCount = fileRdd.count();
final String[] actCols = headersUpdated.split(",");
final LinkedHashMap<Object, String> mapOfColNameAndType = new LinkedHashMap<Object, String>();
final List<String> colNameAndType = tabColMapWithColTypes1
.get(strTableNameInCass.toUpperCase());
for (int i = 0; i < actCols.length; i++) {
if (colNameAndType.contains(actCols[i] + " " + "text")) {
int indexOfColName = colNameAndType.indexOf(actCols[i]
+ " " + "text");
mapOfColNameAndType.put(i,
colNameAndType.get(indexOfColName).split(" ")[1]);
} else if (colNameAndType
.contains(actCols[i] + " " + "decimal")) {
int indexOfColName = colNameAndType.indexOf(actCols[i]
+ " " + "decimal");
mapOfColNameAndType.put(i,
colNameAndType.get(indexOfColName).split(" ")[1]);
} else {
continue;
}
}
//creates the query for prepared statement
final String makeStatement = makeSt(cassKeyspaceName,
strTableNameInCass, actCols);
final long seqId1 = cassTableObj.count();
//calling map on the fileRdd
JavaRDD<String> data = fileRdd.values().map(
new Function<TextArrayWritable, String>() {
/**
*
*/
private static final long serialVersionUID = 1L;
Session session;
boolean isssession = false;
PreparedStatement statement;
BatchStatement batch;
int lineCount = 0;
long seqId = seqId1;
/*for each line returned as an TextArrayWritable convert each cell the corresponding
* bind the data to prepared statement
* add it to batch
*/
#Override
public String call(TextArrayWritable tup)
throws Exception {
seqId++;
lineCount++;
log.info("entered here 3 for file "+strSourceFilePath.toString());
String[] part = tup.toStrings();
Object[] parts = getDataWithUniqueId(
strTableNameInCass, part);
//For each file
//Creates the session
//creates the PreparedStatement
if (!isssession) {
session = connector.openSession();
statement = session.prepare(makeStatement);
log.info("entered here 4 for file "+strSourceFilePath.toString());
// System.out.println("statement :" +
// statement);
isssession = true;
batch = new BatchStatement();
}
List<Object> typeConvData = new ArrayList<Object>();
for (int i = 0; i < parts.length; i++) {
String type = mapOfColNameAndType.get(i);
try {
if (type.equalsIgnoreCase("text")) {
typeConvData.add(parts[i]);
} else {
// parts[i] =
// parts[i].toString().replace("\"",
// "");
// check if the String the has to
// converted to a BigDecimal is any
// positive or negetive integer or not.
// if its not a positive integer or
// negative forcefully convert it to
// zero (avoiding NumberFormatException)
if (!((String) parts[i])
.matches("-?\\d+")) {
parts[i] = "0";
}
long s = Long
.valueOf((String) parts[i]);
typeConvData.add(BigDecimal.valueOf(s));
}
} catch (NullPointerException e) {
log.log(Level.SEVERE, "loadToCass method", e);
} catch (NumberFormatException e) {
log.log(Level.SEVERE, "loadToCass method", e);
} catch (InvalidTypeException e) {
log.log(Level.SEVERE, "loadToCass method", e);
}
}
List<Object> data = typeConvData;
//bind data to query
final BoundStatement query = statement.bind(data
.toArray(new Object[data.size()]));
//add query to batch
batch.add(query);
int count = LoadToCassandra.count;
//when count is 1k execute batch
if (count == 1000) {
log.info("entered here 5 for file "+strSourceFilePath.toString());
log.info("batch done");
session.execute(batch);
LoadToCassandra.count = 0;
batch = new BatchStatement();
return StringUtils.join(tup.toStrings());
}
//if its the last batch and its not of size 1k
if (lineCount == (recCount))
{
log.info("Last Batch");
session.executeAsync(batch);
log.info("entered here 6 for file "+strSourceFilePath.toString());
//session.execute(batch);
session.close();
log.info("Session closed");
}
LoadToCassandra.count++;
return StringUtils.join(tup.toStrings());
}
private Object[] getDataWithUniqueId(
String strTableNameInCass, String[] part) {
Object[] parts = null;
ArrayList<String> tempArraylist = new ArrayList<String>();
if (!strTableNameInCass
.equalsIgnoreCase("PCMASTER")) {
for (int i = 0; i < part.length; i++) {
if (i == 0) {
tempArraylist.add(0,
String.valueOf(seqId));
}
tempArraylist.add(part[i]);
}
parts = tempArraylist.toArray();
} else {
parts = part;
}
return parts;
}
});
data.count();
hadoopFs.close();
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Returns the RDD over the rows of a Cassandra table.
 *
 * @param ctx                Spark context
 * @param cassKeyspaceName   keyspace of the table
 * @param strTableNameInCass table name; it is lower-cased before the lookup
 * @return the RDD of the table's rows
 */
private static JavaRDD<CassandraRow> getCassTableObj(
        JavaSparkContext ctx, String cassKeyspaceName,
        String strTableNameInCass) {
    String lowerCaseTableName = strTableNameInCass.toLowerCase();
    return javaFunctions(ctx).cassandraTable(cassKeyspaceName, lowerCaseTableName);
}
/**
 * Creates the Cassandra connector from the configuration of the Spark context.
 *
 * @param ctx Spark context whose configuration is used
 * @return the connector
 */
private static CassandraConnector getCassandraConnection(
        JavaSparkContext ctx) {
    CassandraConnector connector = CassandraConnector.apply(ctx.getConf());
    return connector;
}
/**
 * Builds the text of the INSERT statement with one '?' placeholder per column.
 *
 * @param keyspace keyspace of the table
 * @param tabName  table name
 * @param colNames column names in the order of the values; must not be empty
 * @return the statement text, e.g. "INSERT INTO ks.tab ( a,b ) values ( ?,? ) "
 */
private static String makeSt(String keyspace, String tabName,
        String[] colNames) {
    int lastIndex = colNames.length - 1;
    StringBuilder columns = new StringBuilder();
    StringBuilder placeholders = new StringBuilder();
    // every column except the last one is followed by a comma
    for (int i = 0; i < lastIndex; i++) {
        columns.append(colNames[i]).append(",");
        placeholders.append("?,");
    }
    columns.append(colNames[lastIndex]);
    placeholders.append("?");
    return "INSERT INTO " + keyspace + "." + tabName + " ( " + columns
            + " ) values ( " + placeholders + " ) ";
}
}
Can anyone tell me what could the reason that causes this problem and how can it be resolved. Thanks
Once you have inserted your data into Cassandra, call the ctx.stop() method; it stops the Spark context.

Low memory writing/reading with Apache POI

I'm trying to write a pretty large XLSX file (4M+ cells) and I'm having some memory issues.
I can't use SXSSF since I also need to read the existing cells in the template.
Is there anything I can do to reduce the memory footprint?
Perhaps combine streaming reading and streaming writing?
To handle large data with low memory, the best and, I think, the only option is the SXSSF API.
If you need to read some data from the existing cells, I assume you do not need all 4M+ cells at the same time.
In that case, depending on your application's requirements, you can manage the window size yourself and keep in memory only the amount of data you need at a particular time.
You can start by looking at the example at :
http://poi.apache.org/spreadsheet/how-to.html#sxssf
Something as
SXSSFWorkbook wb = new SXSSFWorkbook(-1); // turn off auto-flushing and accumulate all rows in memory
// manually control how rows are flushed to disk
if (rownum % NOR == 0) {
    ((SXSSFSheet) sh).flushRows(NOR); // retain NOR last rows and flush all others
}
Hope this helps.
I used a SAX parser to process the events of the document's XML representation. This is the code:
import com.sun.org.apache.xerces.internal.parsers.SAXParser;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
public class LowMemoryExcelFileReader {
private String file;
public LowMemoryExcelFileReader(String file) {
this.file = file;
}
public List<String[]> read() {
try {
return processFirstSheet(file);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private List<String []> readSheet(Sheet sheet) {
List<String []> res = new LinkedList<>();
Iterator<Row> rowIterator = sheet.rowIterator();
while (rowIterator.hasNext()) {
Row row = rowIterator.next();
int cellsNumber = row.getLastCellNum();
String [] cellsValues = new String[cellsNumber];
Iterator<Cell> cellIterator = row.cellIterator();
int cellIndex = 0;
while (cellIterator.hasNext()) {
Cell cell = cellIterator.next();
cellsValues[cellIndex++] = cell.getStringCellValue();
}
res.add(cellsValues);
}
return res;
}
public String getFile() {
return file;
}
public void setFile(String file) {
this.file = file;
}
private List<String []> processFirstSheet(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename, PackageAccess.READ);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
SheetHandler handler = new SheetHandler(sst);
XMLReader parser = fetchSheetParser(handler);
Iterator<InputStream> sheetIterator = r.getSheetsData();
if (!sheetIterator.hasNext()) {
return Collections.emptyList();
}
InputStream sheetInputStream = sheetIterator.next();
BufferedInputStream bisSheet = new BufferedInputStream(sheetInputStream);
InputSource sheetSource = new InputSource(bisSheet);
parser.parse(sheetSource);
List<String []> res = handler.getRowCache();
bisSheet.close();
return res;
}
public XMLReader fetchSheetParser(ContentHandler handler) throws SAXException {
XMLReader parser = new SAXParser();
parser.setContentHandler(handler);
return parser;
}
/**
* See org.xml.sax.helpers.DefaultHandler javadocs
*/
private static class SheetHandler extends DefaultHandler {
private static final String ROW_EVENT = "row";
private static final String CELL_EVENT = "c";
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private List<String> cellCache = new LinkedList<>();
private List<String[]> rowCache = new LinkedList<>();
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => cell
if (CELL_EVENT.equals(name)) {
String cellType = attributes.getValue("t");
if(cellType != null && cellType.equals("s")) {
nextIsString = true;
} else {
nextIsString = false;
}
} else if (ROW_EVENT.equals(name)) {
if (!cellCache.isEmpty()) {
rowCache.add(cellCache.toArray(new String[cellCache.size()]));
}
cellCache.clear();
}
// Clear contents cache
lastContents = "";
}
public void endElement(String uri, String localName, String name)
throws SAXException {
// Process the last contents as required.
// Do now, as characters() may be called more than once
if(nextIsString) {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
nextIsString = false;
}
// v => contents of a cell
// Output after we've seen the string contents
if(name.equals("v")) {
cellCache.add(lastContents);
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
lastContents += new String(ch, start, length);
}
public List<String[]> getRowCache() {
return rowCache;
}
}
}

Using PDFbox to determine the coordinates of words in a document

I'm using PDFbox to extract the coordinates of words/strings in a PDF document, and have so far had success determining the position of individual characters. this is the code thus far, from the PDFbox doc:
package printtextlocations;
import java.io.*;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;
import java.util.List;
public class PrintTextLocations extends PDFTextStripper {
public PrintTextLocations() throws IOException {
super.setSortByPosition(true);
}
public static void main(String[] args) throws Exception {
PDDocument document = null;
try {
File input = new File("C:\\path\\to\\PDF.pdf");
document = PDDocument.load(input);
if (document.isEncrypted()) {
try {
document.decrypt("");
} catch (InvalidPasswordException e) {
System.err.println("Error: Document is encrypted with a password.");
System.exit(1);
}
}
PrintTextLocations printer = new PrintTextLocations();
List allPages = document.getDocumentCatalog().getAllPages();
for (int i = 0; i < allPages.size(); i++) {
PDPage page = (PDPage) allPages.get(i);
System.out.println("Processing page: " + i);
PDStream contents = page.getContents();
if (contents != null) {
printer.processStream(page, page.findResources(), page.getContents().getStream());
}
}
} finally {
if (document != null) {
document.close();
}
}
}
/**
* #param text The text to be processed
*/
#Override /* this is questionable, not sure if needed... */
protected void processTextPosition(TextPosition text) {
System.out.println("String[" + text.getXDirAdj() + ","
+ text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
+ text.getXScale() + " height=" + text.getHeightDir() + " space="
+ text.getWidthOfSpace() + " width="
+ text.getWidthDirAdj() + "]" + text.getCharacter());
}
}
This produces a series of lines containing the position of each character, including spaces, that looks like this:
String[202.5604,41.880127 fs=1.0 xscale=13.98 height=9.68814 space=3.8864403 width=9.324661]P
Where 'P' is the character. I have not been able to find a function in PDFbox to find words, and I am not familiar enough with Java to accurately concatenate these characters back into words to search through, even though the spaces are also included. Has anyone else been in a similar situation, and if so, how did you approach it? I really only need the coordinate of the first character in the word, so that part is simplified, but how I'm going to match a string against that kind of output is beyond me.
There is no function in PDFBox that allows you to extract words automatically. I'm currently working on extracting data to gather it into blocks and here is my process:
I extract all the characters of the document (called glyphs) and store them in a list.
I do an analysis of the coordinates of each glyph, looping over the list. If they overlap (if the top of the current glyph is contained between the top and bottom of the preceding/or the bottom of the current glyph is contained between the top and bottom of the preceding one), I add it to the same line.
At this point, I have extracted the different lines of the document (be careful, if your document is multi-column, the expression "lines" means all the glyphs that overlap vertically, ie the text of all the columns that have the same vertical coordinates).
Then, you can compare the left coordinate of the current glyph to the right coordinate of the preceding one to determine whether they belong to the same word or not. The PDFTextStripper class provides a getSpacingTolerance() method that gives you, based on trial and error, the value of a "normal" space. If the difference between the right and the left coordinates is lower than this value, both glyphs belong to the same word.
I applied this method to my work and it works well.
Based on the original idea here is a version of the text search for PDFBox 2. The code itself is rough, but simple. It should get you started fairly quickly.
import java.io.IOException;
import java.io.Writer;
import java.util.List;
import java.util.Set;
import lu.abac.pdfclient.data.PDFTextLocation;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
public class PrintTextLocator extends PDFTextStripper {
private final Set<PDFTextLocation> locations;
public PrintTextLocator(PDDocument document, Set<PDFTextLocation> locations) throws IOException {
super.setSortByPosition(true);
this.document = document;
this.locations = locations;
this.output = new Writer() {
#Override
public void write(char[] cbuf, int off, int len) throws IOException {
}
#Override
public void flush() throws IOException {
}
#Override
public void close() throws IOException {
}
};
}
public Set<PDFTextLocation> doSearch() throws IOException {
processPages(document.getDocumentCatalog().getPages());
return locations;
}
#Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
super.writeString(text);
String searchText = text.toLowerCase();
for (PDFTextLocation textLoc:locations) {
int start = searchText.indexOf(textLoc.getText().toLowerCase());
if (start!=-1) {
// found
TextPosition pos = textPositions.get(start);
textLoc.setFound(true);
textLoc.setPage(getCurrentPageNo());
textLoc.setX(pos.getXDirAdj());
textLoc.setY(pos.getYDirAdj());
}
}
}
}
Take a look at this; I think it's what you need.
https://jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/
Here is the code:
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
public class PrintTextLocations extends PDFTextStripper {
public static StringBuilder tWord = new StringBuilder();
public static String seek;
public static String[] seekA;
public static List wordList = new ArrayList();
public static boolean is1stChar = true;
public static boolean lineMatch;
public static int pageNo = 1;
public static double lastYVal;
public PrintTextLocations()
throws IOException {
super.setSortByPosition(true);
}
public static void main(String[] args)
throws Exception {
PDDocument document = null;
seekA = args[1].split(",");
seek = args[1];
try {
File input = new File(args[0]);
document = PDDocument.load(input);
if (document.isEncrypted()) {
try {
document.decrypt("");
} catch (InvalidPasswordException e) {
System.err.println("Error: Document is encrypted with a password.");
System.exit(1);
}
}
PrintTextLocations printer = new PrintTextLocations();
List allPages = document.getDocumentCatalog().getAllPages();
for (int i = 0; i < allPages.size(); i++) {
PDPage page = (PDPage) allPages.get(i);
PDStream contents = page.getContents();
if (contents != null) {
printer.processStream(page, page.findResources(), page.getContents().getStream());
}
pageNo += 1;
}
} finally {
if (document != null) {
System.out.println(wordList);
document.close();
}
}
}
#Override
protected void processTextPosition(TextPosition text) {
String tChar = text.getCharacter();
System.out.println("String[" + text.getXDirAdj() + ","
+ text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
+ text.getXScale() + " height=" + text.getHeightDir() + " space="
+ text.getWidthOfSpace() + " width="
+ text.getWidthDirAdj() + "]" + text.getCharacter());
String REGEX = "[,.\\[\\](:;!?)/]";
char c = tChar.charAt(0);
lineMatch = matchCharLine(text);
if ((!tChar.matches(REGEX)) && (!Character.isWhitespace(c))) {
if ((!is1stChar) && (lineMatch == true)) {
appendChar(tChar);
} else if (is1stChar == true) {
setWordCoord(text, tChar);
}
} else {
endWord();
}
}
protected void appendChar(String tChar) {
tWord.append(tChar);
is1stChar = false;
}
protected void setWordCoord(TextPosition text, String tChar) {
tWord.append("(").append(pageNo).append(")[").append(roundVal(Float.valueOf(text.getXDirAdj()))).append(" : ").append(roundVal(Float.valueOf(text.getYDirAdj()))).append("] ").append(tChar);
is1stChar = false;
}
protected void endWord() {
String newWord = tWord.toString().replaceAll("[^\\x00-\\x7F]", "");
String sWord = newWord.substring(newWord.lastIndexOf(' ') + 1);
if (!"".equals(sWord)) {
if (Arrays.asList(seekA).contains(sWord)) {
wordList.add(newWord);
} else if ("SHOWMETHEMONEY".equals(seek)) {
wordList.add(newWord);
}
}
tWord.delete(0, tWord.length());
is1stChar = true;
}
protected boolean matchCharLine(TextPosition text) {
Double yVal = roundVal(Float.valueOf(text.getYDirAdj()));
if (yVal.doubleValue() == lastYVal) {
return true;
}
lastYVal = yVal.doubleValue();
endWord();
return false;
}
protected Double roundVal(Float yVal) {
DecimalFormat rounded = new DecimalFormat("0.0'0'");
Double yValDub = new Double(rounded.format(yVal));
return yValDub;
}
}
Dependencies:
PDFBox,
FontBox,
Apache Commons Logging.
You can run it by typing on command line:
javac PrintTextLocations.java
sudo java PrintTextLocations file.pdf WORD1,WORD2,....
the output is similar to:
[(1)[190.3 : 286.8] WORD1, (1)[283.3 : 286.8] WORD2, ...]
For those who still need assistance, this is what I used in my code and should be useful to start with. It uses PDFBox 2.0.16
public class PDFTextLocator extends PDFTextStripper {
private static String key_string;
private static float x;
private static float y;
public PDFTextLocator() throws IOException {
x = -1;
y = -1;
}
/**
* Takes in a PDF Document, phrase to find, and page to search and returns the x,y in float array
* #param document
* #param phrase
* #param page
* #return
* #throws IOException
*/
public static float[] getCoordiantes(PDDocument document, String phrase, int page) throws IOException {
key_string = phrase;
PDFTextStripper stripper = new PDFTextLocator();
stripper.setSortByPosition(true);
stripper.setStartPage(page);
stripper.setEndPage(page);
stripper.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream()));
y = document.getPage(page).getMediaBox().getHeight()-y;
return new float[]{x,y};
}
/**
* Override the default functionality of PDFTextStripper.writeString()
*/
#Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
if(string.contains(key_string)) {
TextPosition text = textPositions.get(0);
if(x == -1) {
x = text.getXDirAdj();
y = text.getYDirAdj();
}
}
}
}
Below are the Maven dependency details:
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.16</version>
</dependency>
I got this working using the IKVM conversion PDFBox.NET 1.8.9. in C# and .NET.
I finally figured out the character (glyph) coordinates are private to the .NET assembly, but can be accessed using System.Reflection.
I posted a full example of getting the coordinates of WORDS and drawing them back on images of PDF's using SVG and HTML here: https://github.com/tsamop/PDF_Interpreter
For the example below you need PDFbox.NET: http://www.squarepdf.net/pdfbox-in-net, and include references to it in your project.
It took me quite a while to figure it out, so I really hope it saves someone else time!!
If you just need to know where to look for the characters & coordinates, a very abridged version would be:
using System;
using System.Reflection;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.util;
// to test run pdfTest.RunTest(@"C:\temp\test_2.pdf");
class pdfTest
{
    //simple example for getting character (glyph) coordinates out of a pdf doc.
    // a more complete example is here: https://github.com/tsamop/PDF_Interpreter
    // Relies on the GetField(string) extension method, which reads non-public
    // instance fields by name via reflection.
    public static void RunTest(string sFilename)
    {
        //probably a better way to get page count, but I cut this out of a bigger project.
        PDDocument oDoc = PDDocument.load(sFilename);
        try
        {
            object[] oPages = oDoc.getDocumentCatalog().getAllPages().toArray();
            int iPageNo = 0; //1's based!!
            foreach (object oPage in oPages)
            {
                iPageNo++;
                //feed the stripper a page.
                PDFTextStripper tStripper = new PDFTextStripper();
                tStripper.setStartPage(iPageNo);
                tStripper.setEndPage(iPageNo);
                tStripper.getText(oDoc);
                //This gets the "charactersByArticle" private object in PDF Box.
                FieldInfo charactersByArticleInfo = typeof(PDFTextStripper).GetField("charactersByArticle", BindingFlags.NonPublic | BindingFlags.Instance);
                object charactersByArticle = charactersByArticleInfo.GetValue(tStripper);
                // "elementData" is the backing array of the Java list; unused slots are null.
                object[] aoArticles = (object[])charactersByArticle.GetField("elementData");
                foreach (object oArticle in aoArticles)
                {
                    if (oArticle != null)
                    {
                        //THE CHARACTERS within the article
                        object[] aoCharacters = (object[])oArticle.GetField("elementData");
                        foreach (object oChar in aoCharacters)
                        {
                            /*properties I caught using reflection:
                             * endX, endY, font, fontSize, fontSizePt, maxTextHeight, pageHeight, pageWidth, rot, str textPos, unicodCP, widthOfSpace, widths, wordSpacing, x, y
                             *
                             */
                            if (oChar != null)
                            {
                                //this is a really quick test.
                                // for a more complete solution that pulls the characters into words and displays the word positions on the page, try this: https://github.com/tsamop/PDF_Interpreter
                                //the Y's appear to be the bottom of the char?
                                double mfMaxTextHeight = Convert.ToDouble(oChar.GetField("maxTextHeight")); //I think this is the height of the character/word
                                char mcThisChar = oChar.GetField("str").ToString().ToCharArray()[0];
                                double mfX = Convert.ToDouble(oChar.GetField("x"));
                                double mfY = Convert.ToDouble(oChar.GetField("y")) - mfMaxTextHeight;
                                //CALCULATE THE OTHER SIDE OF THE GLYPH
                                double mfWidth0 = ((Single[])oChar.GetField("widths"))[0];
                                double mfXend = mfX + mfWidth0; // Convert.ToDouble(oChar.GetField("endX"));
                                //CALCULATE THE BOTTOM OF THE GLYPH.
                                double mfYend = mfY + mfMaxTextHeight; // Convert.ToDouble(oChar.GetField("endY"));
                                double mfPageHeight = Convert.ToDouble(oChar.GetField("pageHeight"));
                                double mfPageWidth = Convert.ToDouble(oChar.GetField("pageWidth"));
                                System.Diagnostics.Debug.Print(@"add some stuff to test {0}, {1}, {2}", mcThisChar, mfX, mfY);
                            }
                        }
                    }
                }
            }
        }
        finally
        {
            // Release the document even when reflection or extraction throws.
            oDoc.close();
        }
    }
}
using System.Reflection;
/// <summary>
/// To deal with the Java interface hiding necessary properties! ~mwr
/// Reads a non-public instance field by name through reflection.
/// </summary>
public static class GetField_Extension
{
    /// <summary>
    /// Returns the value of the named non-public instance field of the object.
    /// The object's own type is searched first, then each base type in turn, so
    /// private fields declared on a base class are found as well.
    /// </summary>
    /// <param name="randomPDFboxObject">The object to read from.</param>
    /// <param name="sFieldName">Exact name of the field.</param>
    /// <returns>The field's current value (boxed when it is a value type).</returns>
    /// <exception cref="System.ArgumentNullException">The object is null.</exception>
    /// <exception cref="System.MissingFieldException">No type in the hierarchy declares the field.</exception>
    public static object GetField(this object randomPDFboxObject, string sFieldName)
    {
        if (randomPDFboxObject == null)
        {
            throw new System.ArgumentNullException("randomPDFboxObject");
        }
        const BindingFlags flags = BindingFlags.NonPublic | BindingFlags.Instance | BindingFlags.DeclaredOnly;
        for (System.Type current = randomPDFboxObject.GetType(); current != null; current = current.BaseType)
        {
            FieldInfo itemInfo = current.GetField(sFieldName, flags);
            if (itemInfo != null)
            {
                return itemInfo.GetValue(randomPDFboxObject);
            }
        }
        // Fail with the type and field name instead of a bare NullReferenceException.
        throw new System.MissingFieldException(randomPDFboxObject.GetType().FullName, sFieldName);
    }
}

Categories