Automatically generated Hibernate index names too long - Java

I am back with a bug/problem that has just come to light. I usually test local development changes against an H2 database, but this also has to work on Oracle and MSSQL.
Testing on Oracle again, this problem occurred:
The keys COR_VIEWSETTINGSCOR_USERSETTINGS_FK0 and COR_VIEWSETTINGSCOR_USERSETTINGS_FK1 are generated automatically and are far too long for an Oracle database.
To show how these keys are created, here are the entities UserSettings and UserViewSettings.
Hint: you can skip over the entities and go straight to the edits if they confuse you; maybe you can still help me.
UserSettings
/**
 * The Class UserSettings.
 */
@org.hibernate.envers.Audited
@DataObject( value = UserSettings.DATA_OBJECT_NAME )
@CRUDDefinition( supportsRead = true, supportsCreate = true, supportsUpdate = true, supportsDelete = true )
@Entity( name = UserSettings.DATA_OBJECT_NAME )
@NamedQuery( name = UserSettings.DATA_OBJECT_NAME, query = "from userSettings e where e.name = :name" )
@javax.persistence.Inheritance( strategy = javax.persistence.InheritanceType.TABLE_PER_CLASS )
@AttributeOverrides( { @AttributeOverride( name = "id", column = @Column( name = "USERSETTINGS_ID" ) )
} )
@Table( name = "COR_USERSETTINGS", indexes = {
@javax.persistence.Index( name="COR_USERSETTINGS_FK0", columnList = "SETTINGSTYPE_ID" ),
@javax.persistence.Index( name="COR_USERSETTINGS_FK1", columnList = "USER_ID" ),
}
)
public class UserSettings extends NamedRevisionEntity implements NameSettingsType, NameSettings
{
/** The Constant serialVersionUID. */
private static final long serialVersionUID = 1L;
/** The Constant DATA_OBJECT_NAME. */
public static final String DATA_OBJECT_NAME = "userSettings";
@javax.persistence.Basic( fetch = javax.persistence.FetchType.EAGER, optional = false )
@Column( name = "SETTINGS", nullable = false, unique = false, insertable = true, updatable = true )
@javax.persistence.Lob
private java.lang.String settings;
@javax.persistence.ManyToOne( fetch = javax.persistence.FetchType.EAGER, optional = false )
@javax.persistence.JoinColumn( name = "SETTINGSTYPE_ID", nullable = false, unique = false, insertable = true, updatable = true )
private SettingsType settingsType;
@javax.persistence.ManyToOne( fetch = javax.persistence.FetchType.EAGER, optional = true )
@javax.persistence.JoinColumn( name = "USER_ID", nullable = true, unique = false, insertable = true, updatable = true )
private User user;
public SettingsType getSettingsType()
{
return settingsType;
}
public void setSettingsType( SettingsType settingsType )
{
this.settingsType = settingsType;
}
public User getUser()
{
return user;
}
public void setUser( User user )
{
this.user = user;
}
public java.lang.String getSettings()
{
return settings;
}
public void setSettings( java.lang.String settings )
{
this.settings = settings;
}
@Override
public String getDataObjectName()
{
return DATA_OBJECT_NAME;
}
@Override
public String toString()
{
StringBuilder builder = new StringBuilder( super.toString() );
builder.append( ", " );
try
{
builder.append( ToStringUtils.referenceToString( "settingsType", "SettingsType", this.settingsType ) );
}
catch( Exception ex )
{
builder.append( ex.getClass().getName() );
builder.append( ": " );
builder.append( ex.getMessage() );
}
builder.append( ", " );
try
{
builder.append( ToStringUtils.referenceToString( "user", "User", this.user ) );
}
catch( Exception ex )
{
builder.append( ex.getClass().getName() );
builder.append( ": " );
builder.append( ex.getMessage() );
}
builder.append( "]" );
return builder.toString();
}
}
UserViewSettings
/**
 * The Class UserViewSettings.
 */
@org.hibernate.envers.Audited
@DataObject( value = UserViewSettings.DATA_OBJECT_NAME )
@CRUDDefinition( supportsRead = true, supportsCreate = true, supportsUpdate = true, supportsDelete = true )
@Entity( name = UserViewSettings.DATA_OBJECT_NAME )
@AttributeOverrides( { @AttributeOverride( name = "id", column = @Column( name = "VIEWSETTINGS_ID" ) )
} )
@Table( name = "COR_VIEWSETTINGS", uniqueConstraints = {
@javax.persistence.UniqueConstraint( name="COR_VIEWSETTINGS_UNQ1", columnNames = { "NAME", "SETTINGSTYPE_ID", "VIEW_NAME", "VIEWTYPE_ID" } ),
}
, indexes = {
@javax.persistence.Index( name="COR_VIEWSETTINGS_FK0", columnList = "VIEWTYPE_ID" ),
}
)
public class UserViewSettings extends UserSettings implements NameViewName, NameViewType
{
/** The Constant serialVersionUID. */
private static final long serialVersionUID = 1L;
/** The Constant DATA_OBJECT_NAME. */
public static final String DATA_OBJECT_NAME = "userViewSettings";
@javax.persistence.Basic( fetch = javax.persistence.FetchType.EAGER, optional = false )
@Column( name = "VIEW_NAME", nullable = false, unique = false, insertable = true, updatable = true )
private java.lang.String viewName;
@javax.persistence.ManyToOne( fetch = javax.persistence.FetchType.EAGER, optional = true )
@javax.persistence.JoinColumn( name = "VIEWTYPE_ID", nullable = true, unique = false, insertable = true, updatable = true )
private ViewType viewType;
public java.lang.String getViewName()
{
return viewName;
}
public void setViewName( java.lang.String viewName )
{
this.viewName = viewName;
}
public ViewType getViewType()
{
return viewType;
}
public void setViewType( ViewType viewType )
{
this.viewType = viewType;
}
@Override
public String getDataObjectName()
{
return DATA_OBJECT_NAME;
}
@Override
public String toString()
{
StringBuilder builder = new StringBuilder( super.toString() );
builder.append( ", " );
builder.append( "viewName" );
builder.append( "=" );
builder.append( this.viewName );
builder.append( ", " );
try
{
builder.append( ToStringUtils.referenceToString( "viewType", "ViewType", this.viewType ) );
}
catch( Exception ex )
{
builder.append( ex.getClass().getName() );
builder.append( ": " );
builder.append( ex.getMessage() );
}
builder.append( "]" );
return builder.toString();
}
}
Starting WildFly 10.0.0 with Hibernate 5.2 against an Oracle 11 database then results in the error that the automatically generated keys COR_VIEWSETTINGSCOR_USERSETTINGS_FK0 and COR_VIEWSETTINGSCOR_USERSETTINGS_FK1 are too long for the database (Oracle limits identifiers to 30 characters).
I took a look at the NamingStrategies for Hibernate and even tried some, but they didn't change the error for me.
How can I influence the generation of these keys?
EDIT:
So turning on DEBUG gave me this:
2016-11-29 09:22:03,190 DEBUG [org.hibernate.SQL] (ServerService Thread Pool -- 58) create index COR_USERSETTINGS_FK0 on COR_USERSETTINGS (SETTINGSTYPE_ID)
2016-11-29 09:22:03,190 DEBUG [org.hibernate.SQL] (ServerService Thread Pool -- 58) create index COR_USERSETTINGS_FK1 on COR_USERSETTINGS (USER_ID)
2016-11-29 09:22:03,190 DEBUG [org.hibernate.SQL] (ServerService Thread Pool -- 58) create index COR_VIEWSETTINGSCOR_USERSETTINGS_FK0 on COR_VIEWSETTINGS (SETTINGSTYPE_ID)
2016-11-29 09:22:03,190 DEBUG [org.hibernate.SQL] (ServerService Thread Pool -- 58) create index COR_VIEWSETTINGSCOR_USERSETTINGS_FK1 on COR_VIEWSETTINGS (USER_ID)
2016-11-29 09:22:03,190 DEBUG [org.hibernate.SQL] (ServerService Thread Pool -- 58) create index COR_VIEWSETTINGS_FK0 on COR_VIEWSETTINGS (VIEWTYPE_ID)
Now I found the class ImplicitIndexNameSource in the package org.hibernate.boot.model.naming, but the internet doesn't really give examples of what I can do with it, and it seems to have been empty for a long time.
EDIT 2:
The previous edit seems to have been the wrong path. I found the place where these logs are created: it's StandardIndexExporter, which gets called from SchemaCreatorImpl. So I need to dig even deeper into the framework, but if somebody sees this: is this the right path? Can I modify the code so that it does what I want? It seems to be hbm2ddl that is the culprit, since the index gets created in StandardIndexExporter in these lines:
final String indexNameForCreation;
if ( dialect.qualifyIndexName() ) {
indexNameForCreation = jdbcEnvironment.getQualifiedObjectNameFormatter().format(
new QualifiedNameImpl(
index.getTable().getQualifiedTableName().getCatalogName(),
index.getTable().getQualifiedTableName().getSchemaName(),
jdbcEnvironment.getIdentifierHelper().toIdentifier( index.getName() )
),
jdbcEnvironment.getDialect()
);
}
else {
indexNameForCreation = index.getName();
}
final StringBuilder buf = new StringBuilder()
.append( "create index " )
.append( indexNameForCreation )
.append( " on " )
.append( tableName )
.append( " (" );
boolean first = true;
Iterator<Column> columnItr = index.getColumnIterator();
while ( columnItr.hasNext() ) {
final Column column = columnItr.next();
if ( first ) {
first = false;
}
else {
buf.append( ", " );
}
buf.append( ( column.getQuotedName( dialect ) ) );
}
buf.append( ")" );
return new String[] { buf.toString() };
I would appreciate help a lot. This is getting really frustrating.

So I got it working.
Answering for future people that might find this and have the same issue.
The index name gets created by the Oracle dialect that Hibernate is using. So what had to be done was implementing a custom OracleDialect that overrides the method getIndexExporter and points to a custom IndexExporter. In this IndexExporter you can then modify the way the names are created. In my case I solved it like this:
/**
 * Gets the correct index name if it is an index for a TABLE_PER_CLASS inheritance and longer than
 * 30 chars.
 *
 * @param index the index to decide for
 * @return the correct index name
 */
private String getCorrectIndexName( Index index )
{
if ( index.getTable() instanceof DenormalizedTable && index.getName().length() > 30 )
{
String prefixedTable = index.getTable().getName();
String tableName = prefixedTable.substring( prefixedTable.indexOf( '_' ) + 1, prefixedTable.length() );
tableName = shortenName( tableName );
Iterator<Column> columnItr = index.getColumnIterator();
String reference;
if ( columnItr.hasNext() )
{
reference = extractReference( columnItr.next() );
}
else
{
/** backup strategy to prevent exceptions */
reference = shortenName( NamingHelper.INSTANCE.hashedName( index.getName() ) );
}
return tableName + "_" + reference;
}
return index.getName();
}
/**
 * Extracts the reference column of the index and hashes the full name before shortening it with
 * shortenName().
 *
 * @param column the column to extract the reference from.
 * @return the reference with an appended _FK(hashedReference).
 */
private String extractReference( Column column )
{
String reference = column.getQuotedName( dialect );
String md5Hash = NamingHelper.INSTANCE.hashedName( reference );
md5Hash = md5Hash.substring( md5Hash.length() - 4, md5Hash.length() );
reference = shortenName( reference );
return reference + "_FK" + md5Hash;
}
/**
 * Shorten the name to a maximum of 11 chars if it's longer.
 *
 * @param reference the reference to shorten
 * @return the shortened string
 */
private static String shortenName( String reference )
{
if ( reference.length() > 11 )
{
return reference.substring( 0, 11 );
}
return reference;
}
This has to be called in the overridden method getSqlCreateStrings; the changed lines look like this:
String indexName = getCorrectIndexName( index );
indexNameForCreation = jdbcEnvironment.getQualifiedObjectNameFormatter()
.format(
new QualifiedNameImpl( index.getTable().getQualifiedTableName().getCatalogName(),
index.getTable().getQualifiedTableName().getSchemaName(), jdbcEnvironment.getIdentifierHelper().toIdentifier( indexName ) ),
jdbcEnvironment.getDialect() );
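For reference, the wiring around this looks roughly like the sketch below; the class names are my own placeholders, not the exact code, but the essential part is overriding getIndexExporter() in a custom Oracle dialect and then registering that dialect via the hibernate.dialect property (for example in persistence.xml or the WildFly datasource configuration).
// Sketch only: ShortIndexNameOracleDialect and ShortIndexNameExporter are assumed names;
// ShortIndexNameExporter is the custom exporter (extending StandardIndexExporter) shown above.
public class ShortIndexNameOracleDialect extends org.hibernate.dialect.Oracle10gDialect
{
    private final ShortIndexNameExporter indexExporter = new ShortIndexNameExporter( this );

    @Override
    public org.hibernate.tool.schema.spi.Exporter<org.hibernate.mapping.Index> getIndexExporter()
    {
        // hand Hibernate the exporter that shortens the generated index names
        return indexExporter;
    }
}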
I hope that helps someone.

Related

mapstruct does not set relationship properly on a bi-directional OneToMany

I have a JPA one-to-many bi-directional association. In my code I set the relationship on both sides, but the generated MapStruct code does not seem to set the relationship properly; it only sets it on one side.
I pasted part of my code below. The commented line was added manually by me; it should have been generated by MapStruct:
derivativeFuture.setDerivativeExecutions( derivativeExecutionDTOSetToDerivativeExecutionSet( derivativeDTO.getDerivativeExecutions() ) );
//derivativeFuture.getDerivativeExecutions().forEach(derivativeExecution -> { derivativeExecution.setDerivative(derivativeFuture); });
protected Set<DerivativeExecution> derivativeExecutionDTOSetToDerivativeExecutionSet(Set<DerivativeExecutionDTO> set) {
if ( set == null ) {
return null;
}
Set<DerivativeExecution> set1 = new HashSet<DerivativeExecution>( Math.max( (int) ( set.size() / .75f ) + 1, 16 ) );
for ( DerivativeExecutionDTO derivativeExecutionDTO : set ) {
set1.add( derivativeExecutionDTOToDerivativeExecution( derivativeExecutionDTO ) );
}
return set1;
}
protected DerivativeExecution derivativeExecutionDTOToDerivativeExecution(DerivativeExecutionDTO derivativeExecutionDTO) {
if ( derivativeExecutionDTO == null ) {
return null;
}
DerivativeExecution derivativeExecution = new DerivativeExecution();
derivativeExecution.setPhysicalQuantity( derivativeExecutionDTO.getPhysicalQuantity() );
derivativeExecution.setExchangeQuantity( derivativeExecutionDTO.getExchangeQuantity() );
derivativeExecution.setPurchaseSaleIndicator( derivativeExecutionDTO.getPurchaseSaleIndicator() );
derivativeExecution.setQuotePricingStartDate( derivativeExecutionDTO.getQuotePricingStartDate() );
derivativeExecution.setQuotePricingEndDate( derivativeExecutionDTO.getQuotePricingEndDate() );
derivativeExecution.setContractExecutionId( derivativeExecutionDTO.getContractExecutionId() );
return derivativeExecution;
}
There are two options: the ADDER_PREFERRED collection mapping strategy: http://mapstruct.org/documentation/stable/reference/html/#collection-mapping-strategies, or using a context: https://github.com/mapstruct/mapstruct-examples/tree/master/mapstruct-jpa-child-parent.
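As a rough illustration of the first option (the mapper and adder below are assumptions, not code from the question): with ADDER_PREFERRED, MapStruct calls an adder method per element instead of setting the whole collection, and that adder can maintain both sides of the association.
import org.mapstruct.CollectionMappingStrategy;
import org.mapstruct.Mapper;

@Mapper( collectionMappingStrategy = CollectionMappingStrategy.ADDER_PREFERRED )
public interface DerivativeMapper {
    DerivativeFuture toEntity( DerivativeDTO derivativeDTO );
}

// Inside the DerivativeFuture entity: the adder MapStruct now uses per element
// also sets the owning side of the relationship.
public void addDerivativeExecution( DerivativeExecution derivativeExecution ) {
    this.derivativeExecutions.add( derivativeExecution );
    derivativeExecution.setDerivative( this );
}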

Spring Data JPA BigList insert

I've been trying for two days now to store an array list with about six million entries in my Postgres database with Spring-Data-JPA.
The whole thing works, but it's very slow. I need about 27 minutes for everything.
I've already played around with the batch size, but that didn't bring much success. I also noticed that saving takes longer and longer the bigger the table gets. Is there a way to speed it up ?
I've done the whole thing with SQLite before, there I only needed about 15 seconds for the same amount.
My Entity
@Data
@Entity
@Table(name = "commodity_prices")
public class CommodityPrice {
@Id
@Column( name = "id" )
@GeneratedValue( strategy = GenerationType.SEQUENCE )
private long id;
@Column(name = "station_id")
private int station_id;
@Column(name = "commodity_id")
private int commodity_id;
@Column(name = "supply")
private long supply;
@Column(name = "buy_price")
private int buy_price;
@Column(name = "sell_price")
private int sell_price;
@Column(name = "demand")
private long demand;
@Column(name = "collected_at")
private long collected_at;
public CommodityPrice( int station_id, int commodity_id, long supply, int buy_price, int sell_price, long demand,
long collected_at ) {
this.station_id = station_id;
this.commodity_id = commodity_id;
this.supply = supply;
this.buy_price = buy_price;
this.sell_price = sell_price;
this.demand = demand;
this.collected_at = collected_at;
}
}
My insert Class
@Slf4j
@Component
public class CommodityPriceHandler {
@Autowired
CommodityPriceRepository commodityPriceRepository;
@Autowired
private EntityManager entityManager;
public void insertIntoDB() {
int lineCount = 0;
List<CommodityPrice> commodityPrices = new ArrayList<>( );
StopWatch stopWatch = new StopWatch();
stopWatch.start();
try {
Reader reader = new FileReader( DOWNLOAD_SAVE_PATH + FILE_NAME_COMMODITY_PRICES );
Iterable<CSVRecord> records = CSVFormat.EXCEL.withFirstRecordAsHeader().parse( reader );
for( CSVRecord record : records ) {
int station_id = Integer.parseInt( record.get( "station_id" ) );
int commodity_id = Integer.parseInt( record.get( "commodity_id" ) );
long supply = Long.parseLong( record.get( "supply" ) );
int buy_price = Integer.parseInt( record.get( "buy_price" ) );
int sell_price = Integer.parseInt( record.get( "sell_price" ) );
long demand = Long.parseLong( record.get( "demand" ) );
long collected_at = Long.parseLong( record.get( "collected_at" ) );
CommodityPrice commodityPrice = new CommodityPrice(station_id, commodity_id, supply, buy_price, sell_price, demand, collected_at);
commodityPrices.add( commodityPrice );
if (commodityPrices.size() == 1000){
commodityPriceRepository.saveAll( commodityPrices );
commodityPriceRepository.flush();
entityManager.clear();
commodityPrices.clear();
System.out.println(lineCount);
}
lineCount ++;
}
}
catch( IOException e ) {
log.error( e.getLocalizedMessage() );
}
commodityPriceRepository.saveAll( commodityPrices );
stopWatch.stop();
log.info( "Successfully inserted " + lineCount + " lines in " + stopWatch.getTotalTimeSeconds() + " seconds." );
}
}
My application.properties
# HIBERNATE
spring.jpa.properties.hibernate.dialect=org.hibernate.dialect.PostgreSQLDialect
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
spring.jpa.hibernate.ddl-auto = update
spring.jpa.properties.hibernate.jdbc.batch_size=1000
spring.jpa.properties.hibernate.order_inserts=true
While you are doing your inserts in batches, your sequence generation strategy still requires one statement per record to fetch the next id. Thus, for a batch size of 1000 records you issue 1001 statements, which is clearly not what is expected.
My recommendations:
enable sql logging to see what statements are sent to your db. I personally use datasource-proxy, but use anything you are happy with.
modify your sequence generator. At a minimum, use
@Id
@Column( name = "id" )
@GeneratedValue(generator = "com_pr_generator", strategy = GenerationType.SEQUENCE )
@SequenceGenerator(name="com_pr_generator", sequenceName = "book_seq", allocationSize=50)
private long id;
Read about the different generation strategies and fine-tune your sequence generator:
A beginner’s guide to Hibernate enhanced identifier generators
Hibernate pooled and pooled-lo identifier generators
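For example, a pooled-lo setup might look roughly like this (the sequence name and sizes are assumptions; @GenericGenerator and @Parameter come from org.hibernate.annotations):
@Id
@Column( name = "id" )
@GeneratedValue( generator = "com_pr_generator", strategy = GenerationType.SEQUENCE )
@GenericGenerator(
    name = "com_pr_generator",
    strategy = "org.hibernate.id.enhanced.SequenceStyleGenerator",
    parameters = {
        @Parameter( name = "sequence_name", value = "commodity_price_seq" ), // assumed sequence name
        @Parameter( name = "increment_size", value = "50" ),
        @Parameter( name = "optimizer", value = "pooled-lo" )
    } )
private long id;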

Writing a Jagged Array in HDF5 using the Java Native Library

I have tried numerous ways and followed some of the examples that are scattered around the web on how to write a jagged array (an array of arrays that may be of differing lengths) in HDF5.
Most of the examples are in C and rather low-level. Anyhow, I can't seem to get it working, and I just looked at the C source code; it pretty much says that any variable-length datatypes that are not strings are not supported (if I understood it correctly).
My miserable dysfunctional code (as is):
public void WIP_createVLenFloatDataSet( List<? extends Number> floats ) throws Exception
{
String group = "/test";
long groupId = createGroupIfNotExist( group );
MDataQualifier qualifier = new MDataQualifierImpl( group, "float", "0.0.0" );
long datasetId = openDataSet( qualifier );
long heapType = H5.H5Tcopy( MDataType.FLOAT_ARRAY.getHDFType() );
heapType = H5.H5Tvlen_create( heapType );
// heapType = H5.H5Tarray_create( heapType, 1, new long[]{1} );
if( !exists( datasetId ) )
{
long[] maxDims = new long[]{ HDF5Constants.H5S_UNLIMITED };
long dataspaceId = H5.H5Screate_simple( 1, new long[]{ 1 }, null );
// Create the dataset.
long datasetId1 = -1;
try
{
if( exists( m_fileId ) && exists( dataspaceId ) && exists( heapType ) )
{
long creationProperties = H5.H5Pcreate( HDF5Constants.H5P_DATASET_CREATE );
H5.H5Pset_chunk( creationProperties, /*ndims*/1, new long[]{ 1 } );
datasetId1 = H5.H5Dcreate( groupId, qualifier.getVersionedName(), heapType, dataspaceId, H5P_DEFAULT, creationProperties, H5P_DEFAULT );
// H5.H5Pclose( creationProperties );
}
}
catch( Exception e )
{
LOG.error( "Problems creating the dataset: " + e.getMessage(), e );
}
datasetId = datasetId1;
if( exists( datasetId ) )
{
// flushIfNecessary();
LOG.trace( "Wrote empty dataset {}", qualifier.getVersionedName() );
}
}
List<? extends Number> data = ( List<? extends Number> )floats;
// H5.H5Dwrite( datasetId, heapType, dataspaceId, memSpaceId, HDF5Constants.H5P_DEFAULT, Floats.toArray( data) );
ByteBuffer bb = ByteBuffer.allocate( data.size() * 4 );
floats.forEach( f -> bb.putFloat( f.floatValue() ) );
// H5.H5Dwrite( datasetId, heapType, H5S_ALL, H5S_ALL, H5P_DEFAULT, Floats.toArray( data ) );
H5.H5Dwrite( datasetId, heapType, H5S_ALL, H5S_ALL, H5P_DEFAULT, bb.array() );
}
Has anyone done this before and can at least confirm that it's not possible?
The most I can get out of HDF5 is the message "buf does not support variable length type".
Apparently the "glue code" of the JNI wrapper doesn't support this. If you want to use this feature you either have to implement your own JNI or wait for a newer version. The official JNI code is open source and can be found here.

Can you access private variables using OGNL?

Is there any way, using OGNL, to access private variables that aren't exposed as a bean property (i.e. no get/set method pair)? I wanted to use OGNL as a faster, cleaner alternative to reflection for use in unit tests.
Here is my code:
@Test
public void shouldSetup() throws OgnlException{
class A{
private Object b = "foo";
private Object c = "bar";
public Object getB(){ return b; }
}
A a = new A();
System.out.println( "This'll work..." );
Ognl.getValue( "b", a );
System.out.println( "...and this'll fail..." );
Ognl.getValue( "c", a );
System.out.println( "...and we'll never get here." );
}
Actually you can. You need to set a MemberAccess in the OgnlContext that allows access to non-public fields, and use the getValue overload that also takes that context to retrieve the value.
@Test
public void shouldSetup() throws OgnlException {
class A {
private Object b = "foo";
private Object c = "bar";
public Object getB() { return b; }
}
A a = new A();
// set DefaultMemberAccess with allowed access into the context
OgnlContext context = new OgnlContext();
context.setMemberAccess(new DefaultMemberAccess(true));
System.out.println( "This'll work..." );
// use context in getValue method
Ognl.getValue( "b", context, a );
System.out.println( "...and this'll work..." );
// use context in getValue method
Ognl.getValue( "c", context, a );
System.out.println( "...and we'll get here." );
}
The best I could do was a workaround using reflection directly:
static Object getValue( final String ognlPath, final Object o ){
final StringTokenizer st = new StringTokenizer( ognlPath, "." );
Object currentO = o;
String nextFieldName = null;
try{
while( st.hasMoreTokens() ){
nextFieldName = st.nextToken();
if( currentO == null ){
throw new IllegalStateException( "Cannot find field '" + nextFieldName + "' on null object." );
}
Field f = currentO.getClass().getDeclaredField( nextFieldName );
f.setAccessible( true );
currentO = f.get( currentO );
}
return currentO;
}catch( NoSuchFieldException e ){
throw new RuntimeException( "Could not find field '" + nextFieldName + "' on " + currentO.getClass().getCanonicalName(), e );
}catch( IllegalAccessException e ){
throw new RuntimeException( "Failed to get from path '" + ognlPath + "' on object: " + o, e );
}
}
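Used like this (a brief, assumed usage mirroring the OGNL calls above), the helper reads the private field directly:
A a = new A();
Object c = getValue( "c", a ); // returns "bar" without needing a getter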

Solr WordDelimiterFilter + Lucene Highlighter

I am trying to get the Highlighter class from Lucene to work properly with tokens coming from Solr's WordDelimiterFilter. It works 90% of the time, but if the matching text contains a ',' such as "1,500" the output is incorrect:
Expected: 'test 1,500 this'
Observed: 'test 11,500 this'
I am not currently sure whether it is the Highlighter messing up the recombination or the WordDelimiterFilter messing up the tokenization, but something is unhappy. Here are the relevant dependencies from my pom:
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>2.9.3</version>
    <type>jar</type>
    <scope>compile</scope>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>2.9.3</version>
    <type>jar</type>
    <scope>compile</scope>
</dependency>
<dependency>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-core</artifactId>
    <version>1.4.0</version>
    <type>jar</type>
    <scope>compile</scope>
</dependency>
And here is a simple JUnit test class demonstrating the problem:
package test.lucene;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.StandardTokenizerFactory;
import org.apache.solr.analysis.WordDelimiterFilterFactory;
import org.junit.Test;
public class HighlighterTester {
private static final String PRE_TAG = "<b>";
private static final String POST_TAG = "</b>";
private static String[] highlightField( Query query, String fieldName, String text )
throws IOException, InvalidTokenOffsetsException {
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter( PRE_TAG, POST_TAG );
Highlighter highlighter = new Highlighter( formatter, new QueryScorer( query, fieldName ) );
highlighter.setTextFragmenter( new SimpleFragmenter( Integer.MAX_VALUE ) );
return highlighter.getBestFragments( getAnalyzer(), fieldName, text, 10 );
}
private static Analyzer getAnalyzer() {
return new Analyzer() {
@Override
public TokenStream tokenStream( String fieldName, Reader reader ) {
// Start with a StandardTokenizer
TokenStream stream = new StandardTokenizerFactory().create( reader );
// Chain on a WordDelimiterFilter
WordDelimiterFilterFactory wordDelimiterFilterFactory = new WordDelimiterFilterFactory();
HashMap<String, String> arguments = new HashMap<String, String>();
arguments.put( "generateWordParts", "1" );
arguments.put( "generateNumberParts", "1" );
arguments.put( "catenateWords", "1" );
arguments.put( "catenateNumbers", "1" );
arguments.put( "catenateAll", "0" );
wordDelimiterFilterFactory.init( arguments );
return wordDelimiterFilterFactory.create( stream );
}
};
}
@Test
public void TestHighlighter() throws ParseException, IOException, InvalidTokenOffsetsException {
String fieldName = "text";
String text = "test 1,500 this";
String queryString = "1500";
String expected = "test " + PRE_TAG + "1,500" + POST_TAG + " this";
QueryParser parser = new QueryParser( Version.LUCENE_29, fieldName, getAnalyzer() );
Query q = parser.parse( queryString );
String[] observed = highlightField( q, fieldName, text );
for ( int i = 0; i < observed.length; i++ ) {
System.out.println( "\t" + i + ": '" + observed[i] + "'" );
}
if ( observed.length > 0 ) {
System.out.println( "Expected: '" + expected + "'\n" + "Observed: '" + observed[0] + "'" );
assertEquals( expected, observed[0] );
}
else {
assertTrue( "No matches found", false );
}
}
}
Anyone have any ideas or suggestions?
After further investigation, this appears to be a bug in the Lucene Highlighter code. As you can see here:
public class TokenGroup {
...
protected boolean isDistinct() {
return offsetAtt.startOffset() >= endOffset;
}
...
The code attempts to determine whether a group of tokens is distinct by checking whether the start offset is greater than or equal to the previous end offset. The problem with this approach is illustrated by this issue. If you were to step through the tokens, you would see that they are as follows:
0-4: 'test', 'test'
5-6: '1', '1'
7-10: '500', '500'
5-10: '1500', '1,500'
11-15: 'this', 'this'
From this you can see that the third token starts after the end of the second, but the fourth starts the same place as the second. The intended outcome would be to group tokens 2, 3, and 4, but per this implementation, token 3 is seen as separate from 2, so 2 shows up by itself, then 3 and 4 get grouped leaving this outcome:
Expected: 'test <b>1,500</b> this'
Observed: 'test 1<b>1,500</b> this'
I'm not sure this can be accomplished without 2 passes, one to get all the indexes and a second to combine them. Also, I'm not sure what the implications would be outside of this specific case. Does anyone have any ideas here?
EDIT
Here is the final source code I came up with. It groups things correctly and also appears to be MUCH simpler than the Lucene Highlighter implementation, but admittedly it does not handle different levels of scoring, as my application only needs a yes/no as to whether a fragment of text gets highlighted. It is also worth noting that I am using their QueryScorer to score the text fragments, which has the weakness of being term-oriented rather than phrase-oriented, meaning the search string "grammatical or spelling" would end up with highlighting like "<b>grammatical</b> or <b>spelling</b>", as the "or" would most likely get dropped by your analyzer. Anyway, here is my source:
public TextFragments<E> getTextFragments( TokenStream tokenStream,
String text,
Scorer scorer )
throws IOException, InvalidTokenOffsetsException {
OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute( OffsetAttribute.class );
TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute( TermAttribute.class );
TokenStream newStream = scorer.init( tokenStream );
if ( newStream != null ) {
tokenStream = newStream;
}
TokenGroups tgs = new TokenGroups();
scorer.startFragment( null );
while ( tokenStream.incrementToken() ) {
tgs.add( offsetAtt.startOffset(), offsetAtt.endOffset(), scorer.getTokenScore() );
if ( log.isTraceEnabled() ) {
log.trace( new StringBuilder()
.append( scorer.getTokenScore() )
.append( " " )
.append( offsetAtt.startOffset() )
.append( "-" )
.append( offsetAtt.endOffset() )
.append( ": '" )
.append( termAtt.term() )
.append( "', '" )
.append( text.substring( offsetAtt.startOffset(), offsetAtt.endOffset() ) )
.append( "'" )
.toString() );
}
}
return tgs.fragment( text );
}
private class TokenGroup {
private int startIndex;
private int endIndex;
private float score;
public TokenGroup( int startIndex, int endIndex, float score ) {
this.startIndex = startIndex;
this.endIndex = endIndex;
this.score = score;
}
}
private class TokenGroups implements Iterable<TokenGroup> {
private List<TokenGroup> tgs;
public TokenGroups() {
tgs = new ArrayList<TokenGroup>();
}
public void add( int startIndex, int endIndex, float score ) {
add( new TokenGroup( startIndex, endIndex, score ) );
}
public void add( TokenGroup tg ) {
for ( int i = tgs.size() - 1; i >= 0; i-- ) {
if ( tg.startIndex < tgs.get( i ).endIndex ) {
tg = merge( tg, tgs.remove( i ) );
}
else {
break;
}
}
tgs.add( tg );
}
private TokenGroup merge( TokenGroup tg1, TokenGroup tg2 ) {
return new TokenGroup( Math.min( tg1.startIndex, tg2.startIndex ),
Math.max( tg1.endIndex, tg2.endIndex ),
Math.max( tg1.score, tg2.score ) );
}
private TextFragments<E> fragment( String text ) {
TextFragments<E> fragments = new TextFragments<E>();
int lastEndIndex = 0;
for ( TokenGroup tg : this ) {
if ( tg.startIndex > lastEndIndex ) {
fragments.add( text.substring( lastEndIndex, tg.startIndex ), textModeNormal );
}
fragments.add(
text.substring( tg.startIndex, tg.endIndex ),
tg.score > 0 ? textModeHighlighted : textModeNormal );
lastEndIndex = tg.endIndex;
}
if ( lastEndIndex < ( text.length() - 1 ) ) {
fragments.add( text.substring( lastEndIndex ), textModeNormal );
}
return fragments;
}
@Override
public Iterator<TokenGroup> iterator() {
return tgs.iterator();
}
}
Here's a possible cause: your highlighter needs to use the same Analyzer used for the search. IIUC, your code uses a default analyzer for the highlighting, even though it uses a specialized analyzer for parsing the query. I believe you need to change the Fragmenter to work with your specific TokenStream.
