Project

General

Profile

« Previous | Next » 

Revision 40655

created branches folder for dnet-index-solr-client

View differences:

modules/dnet-index-solr-common/branches/solr5/deploy.info
1
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-index-solr-common/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-index-solr-common"}
modules/dnet-index-solr-common/branches/solr5/src/main/java/eu/dnetlib/functionality/index/solr/feed/StreamingInputDocumentFactory.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import java.io.StringReader;
4
import java.io.StringWriter;
5
import java.util.Iterator;
6
import java.util.List;
7
import javax.xml.stream.*;
8
import javax.xml.stream.events.Namespace;
9
import javax.xml.stream.events.StartElement;
10
import javax.xml.stream.events.XMLEvent;
11

  
12
import com.google.common.collect.Lists;
13
import eu.dnetlib.functionality.index.solr.feed.ResultTransformer.Mode;
14
import org.apache.solr.common.SolrInputDocument;
15

  
16
/**
17
 * Optimized version of the document parser, drop in replacement of InputDocumentFactory.
18
 *
19
 * <p>
20
 * Faster because:
21
 * </p>
22
 * <ul>
23
 * <li>Doesn't create a DOM for the full document</li>
24
 * <li>Doesn't execute xpaths against the DOM</li>
25
 * <li>Quickly serialize the 'result' element directly in a string.</li>
26
 * <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
27
 * </ul>
28
 *
29
 * <p>
30
 * This class is fully reentrant and can be invoked in parallel.
31
 * </p>
32
 *
33
 * @author marko
34
 *
35
 */
36
public class StreamingInputDocumentFactory extends InputDocumentFactory {
37

  
38
	/** Element name "dnetResult". Not referenced by the code of this class; presumably the default result element name -- confirm against callers. */
	protected static final String DEFAULTDNETRESULT = "dnetResult";
39

  
40
	/** Local name of the element whose child elements are added to the solr document as fields. */
	protected static final String TARGETFIELDS = "targetFields";
41

  
42
	/** Local name of the element whose text is stored in the INDEX_RECORD_ID field of the solr document. */
	protected static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";
43

  
44
	/** Local name of the element whose namespace declarations are collected and re-declared on the copied result. */
	protected static final String ROOT_ELEMENT = "indexRecord";
45

  
46
	protected ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() {
47

  
48
		@Override
49
		protected XMLInputFactory initialValue() {
50
			return XMLInputFactory.newInstance();
51
		}
52
	};
53

  
54
	protected ThreadLocal<XMLOutputFactory> outputFactory = new ThreadLocal<XMLOutputFactory>() {
55

  
56
		@Override
57
		protected XMLOutputFactory initialValue() {
58
			return XMLOutputFactory.newInstance();
59
		}
60
	};
61

  
62
	protected ThreadLocal<XMLEventFactory> eventFactory = new ThreadLocal<XMLEventFactory>() {
63

  
64
		@Override
65
		protected XMLEventFactory initialValue() {
66
			return XMLEventFactory.newInstance();
67
		}
68
	};
69

  
70
	/**
71
	 * {@inheritDoc}
72
	 *
73
	 * @see eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory#parseDocument(eu.dnetlib.functionality.index.solr.feed.IndexDocument,
74
	 * java.lang.String)
75
	 */
76
	@Override
77
	public SolrInputDocument parseDocument(final String version, final String inputDocument, final String dsId, final String resultName)
78
			throws XMLStreamException {
79
		return parseDocument(version, inputDocument, dsId, resultName, null);
80
	}
81

  
82
	/**
83
	 * {@inheritDoc}
84
	 *
85
	 * @see eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory#parseDocument(eu.dnetlib.functionality.index.solr.feed.IndexDocument,
86
	 *      java.lang.String, com.google.common.base.Function)
87
	 */
88
	@Override
89
	public SolrInputDocument parseDocument(final String version,
90
			final String inputDocument,
91
			final String dsId,
92
			final String resultName,
93
			final ResultTransformer resultTransformer) {
94

  
95
		final StringWriter results = new StringWriter();
96
		final List<Namespace> nsList = Lists.newLinkedList();
97
		try {
98

  
99
			XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
100

  
101
			final SolrInputDocument indexDocument = new SolrInputDocument();
102

  
103
			while (parser.hasNext()) {
104
				final XMLEvent event = parser.nextEvent();
105
				if ((event != null) && event.isStartElement()) {
106
					final String localName = event.asStartElement().getName().getLocalPart();
107

  
108
					if (ROOT_ELEMENT.equals(localName)) {
109
						nsList.addAll(getNamespaces(event));
110
					} else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
111
						final XMLEvent text = parser.nextEvent();
112
						String recordId = getText(text);
113
						indexDocument.addField(INDEX_RECORD_ID, recordId);
114
					} else if (TARGETFIELDS.equals(localName)) {
115
						parseTargetFields(indexDocument, parser);
116
					} else if (resultName.equals(localName)) {
117
						if (resultTransformer == null || !(Mode.empty.equals(resultTransformer.getMode()))) {
118
							copyResult(indexDocument, results, parser, nsList, resultName, resultTransformer);
119
						}
120
					}
121
				}
122
			}
123

  
124
			if (version != null) {
125
				indexDocument.addField(DS_VERSION, version);
126
			}
127

  
128
			if (dsId != null) {
129
				indexDocument.addField(DS_ID, dsId);
130
			}
131

  
132
			if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
133
				indexDocument.clear();
134
				System.err.println("missing indexrecord id:\n" + inputDocument);
135
			}
136

  
137
			return indexDocument;
138
		} catch (XMLStreamException e) {
139
			return new SolrInputDocument();
140
		}
141
	}
142

  
143
	private List<Namespace> getNamespaces(final XMLEvent event) {
144
		final List<Namespace> res = Lists.newLinkedList();
145
		@SuppressWarnings("unchecked")
146
		Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
147
		while (nsIter.hasNext()) {
148
			Namespace ns = nsIter.next();
149
			res.add(ns);
150
		}
151
		return res;
152
	}
153

  
154
	/**
155
	 * Parse the targetFields block and add fields to the solr document.
156
	 *
157
	 * @param indexDocument
158
	 * @param parser
159
	 * @throws XMLStreamException
160
	 */
161
	protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException {
162

  
163
		boolean hasFields = false;
164

  
165
		while (parser.hasNext()) {
166
			final XMLEvent targetEvent = parser.nextEvent();
167
			if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) {
168
				break;
169
			}
170

  
171
			if (targetEvent.isStartElement()) {
172
				final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
173
				final XMLEvent text = parser.nextEvent();
174

  
175
				String data = getText(text);
176

  
177
				addField(indexDocument, fieldName, data);
178
				hasFields = true;
179
			}
180
		}
181

  
182
		if (!hasFields) {
183
			indexDocument.clear();
184
		}
185
	}
186

  
187
	/**
188
	 * Copy the /indexRecord/result element and children, preserving namespace declarations etc.
189
	 *
190
	 * @param indexDocument
191
	 * @param results
192
	 * @param parser
193
	 * @param nsList
194
	 * @throws XMLStreamException
195
	 */
196
	protected void copyResult(final SolrInputDocument indexDocument,
197
			final StringWriter results,
198
			final XMLEventReader parser,
199
			final List<Namespace> nsList,
200
			final String dnetResult,
201
			final ResultTransformer resultTransformer) throws XMLStreamException {
202
		final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
203

  
204
		for (Namespace ns : nsList) {
205
			eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI());
206
		}
207

  
208
		StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator());
209

  
210
		// new root record
211
		writer.add(newRecord);
212

  
213
		// copy the rest as it is
214
		while (parser.hasNext()) {
215
			final XMLEvent resultEvent = parser.nextEvent();
216

  
217
			// TODO: replace with depth tracking instead of close tag tracking.
218
			if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) {
219
				writer.add(eventFactory.get().createEndElement("", null, RESULT));
220
				break;
221
			}
222

  
223
			writer.add(resultEvent);
224
		}
225
		writer.close();
226

  
227
		if (resultTransformer != null) {
228
			indexDocument.addField(INDEX_RESULT, resultTransformer.apply(results.toString()));
229
		} else {
230
			indexDocument.addField(INDEX_RESULT, results.toString());
231
		}
232
	}
233

  
234
	/**
235
	 * Helper used to add a field to a solr doc. It avoids to add empy fields
236
	 *
237
	 * @param indexDocument
238
	 * @param field
239
	 * @param value
240
	 */
241
	private final void addField(final SolrInputDocument indexDocument, final String field, final String value) {
242
		String cleaned = value.trim();
243
		if (!cleaned.isEmpty()) {
244
			// log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n");
245
			indexDocument.addField(field.toLowerCase(), cleaned);
246
		}
247
	}
248

  
249
	/**
250
	 * Helper used to get the string from a text element.
251
	 *
252
	 * @param text
253
	 * @return
254
	 */
255
	protected final String getText(final XMLEvent text) {
256
		if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart());
257
			return "";
258

  
259
		return text.asCharacters().getData();
260
	}
261

  
262
}
modules/dnet-index-solr-common/branches/solr5/src/main/java/eu/dnetlib/functionality/index/solr/feed/ResultTransformer.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import com.google.common.base.Function;
4

  
5
/**
6
 * Created by claudio on 17/11/15.
7
 */
8
public abstract class ResultTransformer implements Function<String, String> {
9

  
10
	public enum Mode {compress, empty, xslt}
11

  
12
	protected Mode mode;
13

  
14
	public ResultTransformer(final Mode mode) {
15
		this.mode = mode;
16
	}
17

  
18
	public Mode getMode() {
19
		return mode;
20
	}
21

  
22
	public void setMode(final Mode mode) {
23
		this.mode = mode;
24
	}
25

  
26
}
modules/dnet-index-solr-common/branches/solr5/src/main/java/eu/dnetlib/functionality/index/solr/feed/InputDocumentFactory.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import java.text.ParseException;
4
import java.text.SimpleDateFormat;
5
import java.util.Arrays;
6
import java.util.List;
7
import javax.xml.stream.XMLStreamException;
8

  
9
import org.apache.solr.common.SolrInputDocument;
10
import org.dom4j.DocumentException;
11

  
12
/**
13
 *
14
 * @author claudio
15
 *
16
 */
17
public abstract class InputDocumentFactory {
18

  
19
	public static final String INDEX_FIELD_PREFIX = "__";
20

  
21
	public static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion";
22

  
23
	public static final String DS_ID = INDEX_FIELD_PREFIX + "dsid";
24

  
25
	public static final String RESULT = "result";
26

  
27
	public static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT;
28

  
29
	public static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
30

  
31
	private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
32

  
33
	private final static List<String> dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
34

  
35
	public abstract SolrInputDocument parseDocument(final String version,
36
			final String inputDocument,
37
			final String dsId,
38
			final String resultName) throws XMLStreamException;
39

  
40
	public abstract SolrInputDocument parseDocument(final String version,
41
			final String inputDocument,
42
			final String dsId,
43
			final String resultName,
44
			final ResultTransformer resultTransformer) throws XMLStreamException;
45

  
46
	/**
47
	 * method return a solr-compatible string representation of a date
48
	 *
49
	 * @param date
50
	 * @return
51
	 * @throws DocumentException
52
	 * @throws ParseException
53
	 */
54
	public static String getParsedDateField(final String date) {
55
		for (String formatString : dateFormats) {
56
			try {
57
				return new SimpleDateFormat(outFormat).format(new SimpleDateFormat(formatString).parse(date));
58
			} catch (ParseException e) {}
59
		}
60
		throw new IllegalStateException("unable to parse date: " + date);
61
	}
62

  
63
	public String parseDate(final String date) {
64
		return getParsedDateField(date);
65
	}
66

  
67
}
modules/dnet-index-solr-common/branches/solr5/pom.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet-parent</artifactId>
6
		<version>1.0.0-SNAPSHOT</version>
7
		<relativePath/>
8
	</parent>
9
	<modelVersion>4.0.0</modelVersion>
10
	<groupId>eu.dnetlib</groupId>
11
	<artifactId>dnet-index-solr-common</artifactId>
12
	<version>3.0.0-SNAPSHOT</version>
13
	<scm>
14
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-index-solr-common/trunk</developerConnection>
15
	</scm>
16
	<properties>
17
		<apache.solr.version>5.4.0</apache.solr.version>
18
	</properties>
19
	<dependencies>
20
		<dependency>
21
			<groupId>org.apache.solr</groupId>
22
			<artifactId>solr-solrj</artifactId>
23
			<version>${apache.solr.version}</version>
24
			<exclusions>
25
				<exclusion>
26
					<artifactId>wstx-asl</artifactId>
27
					<groupId>org.codehaus.woodstox</groupId>
28
				</exclusion>
29
			</exclusions>
30
		</dependency>
31
		<dependency>
32
			<groupId>dom4j</groupId>
33
			<artifactId>dom4j</artifactId>
34
			<version>1.6.1</version>
35
		</dependency>
36
		<dependency>
37
			<groupId>com.google.guava</groupId>
38
			<artifactId>guava</artifactId>
39
			<version>${google.guava.version}</version>
40
		</dependency>
41

  
42
	</dependencies>
43
</project>

Also available in: Unified diff