Project

General

Profile

« Previous | Next » 

Revision 36372

branch for testing the upgrade to CDH 5.3.X

View differences:

modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/actionmanager/xslt/datacite2updateActions.xslt
1
<?xml version="1.0" encoding="UTF-8"?>
2
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
3
    xmlns:dnet="eu.dnetlib.actionmanager.actions.infopackage.DataciteInfoPackageToHbaseXsltFunctions"
4
    xmlns:oaf="http://namespace.openaire.eu/oaf"
5
    xmlns:dri="http://www.driver-repository.eu/namespace/dri"
6
   	xmlns:date="eu.dnetlib.miscutils.datetime.DateUtils"
7
    xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:exslt="http://exslt.org/common"
8
    extension-element-prefixes="exslt" exclude-result-prefixes="xsl dnet exslt oaf dr dri date">
9

  
10
    <xsl:output omit-xml-declaration="yes" indent="yes"/>
11

  
12
    <xsl:param name="trust" select="string('0.9')"/>
13
    <xsl:param name="provenance" select="string('UNKNOWN')"/>
14
    <xsl:param name="namespaceprefix" select="string('datacite____')"/>
15

  
16
    <xsl:template match="/*">
17
		<xsl:variable name="dataInfo" select="/*[local-name() = 'record']/*[local-name() = 'about']/*[local-name() = 'datainfo']"/>
18
        <xsl:variable name="metadata" select="exslt:node-set(/*[local-name()='record']/*[local-name()='metadata']/*[local-name()='resource'])"/>
19
        <xsl:variable name="rightNSPrefix">
20
            <xsl:choose>
21
                <xsl:when test="not($namespaceprefix)">
22
                    <xsl:value-of select="//oaf:datasourceprefix"/>
23
                </xsl:when>
24
                <xsl:otherwise>
25
                    <xsl:value-of select="$namespaceprefix"/>
26
                </xsl:otherwise>
27
            </xsl:choose>
28
        </xsl:variable>
29

  
30
        <xsl:choose>
31
            <xsl:when test="count($metadata) =  0">
32
                <ACTIONS/>
33
            </xsl:when>
34
            <xsl:otherwise>
35
                <xsl:variable name="originalId" select="//*[local-name() = 'identifier' and ./@identifierType='DOI']"/>
36
                <xsl:variable name="resultId" select="dnet:oafSimpleId('result', //dri:objIdentifier)"/>
37

  
38
                <xsl:variable name="creators" select="//*[local-name() = 'creator']"/>
39
                <xsl:variable name="titles" select="//*[local-name() = 'title']"/>
40
                <xsl:variable name="subjects" select="//*[local-name() = 'subject']"/>
41
                <xsl:variable name="publisher" select="//*[local-name() = 'publisher']"/>
42
                <xsl:variable name="descriptions" select="//*[local-name() = 'description']"/>
43
                <xsl:variable name="dates" select="//*[local-name() = 'date']"/>
44
                <xsl:variable name="dateaccepted" select="//oaf:dateAccepted" />
45
                <xsl:variable name="resourceType" select="//*[local-name() = 'resourceType']"/>
46
                <xsl:variable name="formats" select="//*[local-name() = 'format']"/>
47
                <xsl:variable name="sizes" select="//*[local-name() = 'size']"/>
48
                <xsl:variable name="rights" select="//oaf:accessrights" />
49
                <xsl:variable name="version" select="//*[local-name() = 'version']"/>
50
                <xsl:variable name="instanceURI"
51
                    select="concat('http://dx.doi.org','/',//*[local-name() = 'resource']/*[local-name() = 'identifier'])"/>
52
                <xsl:variable name="hostedbyid"
53
                    select="dnet:oafSplitId('datasource', //oaf:hostedBy/@id)"/>
54
                <xsl:variable name="hostedbyname" select="//oaf:hostedBy/@name"/>
55
                <xsl:variable name="collectedfromid"
56
                    select="dnet:oafSplitId('datasource', //oaf:collectedFrom/@id)"/>
57
                <xsl:variable name="collectedfromname" select="//oaf:collectedFrom/@name"/>
58
                <xsl:variable name="dateOfCollection" select="//dr:dateOfCollection"/>
59
                <xsl:variable name="language" select="//oaf:language" />
60
				<xsl:variable name="cobjcategory" select="//dr:CobjCategory" />                
61

  
62
                <ACTIONS>
63
                    <ACTION targetKey="{$resultId}" targetColumnFamily="result" targetColumn="{concat('update_', date:now())}">
64
                        <xsl:value-of
65
                            select="dnet:oafDataCiteResultFromInfoPackage($resultId, $dataInfo, $metadata, $titles, 
66
                            $subjects, $publisher, $descriptions, $dates, $dateaccepted, $resourceType, 
67
                            $formats, $sizes, $language, $cobjcategory, $rights, $version, $provenance, $trust, $hostedbyid, $hostedbyname,
68
                            $collectedfromid, $collectedfromname, $originalId, $instanceURI, $dateOfCollection)"
69
                        />
70
                    </ACTION>
71
                </ACTIONS>
72
            </xsl:otherwise>
73
        </xsl:choose>
74
    </xsl:template>
75
</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/actionmanager/xslt/dmf2updateActions.xslt
1
<?xml version="1.0" encoding="UTF-8"?>
2
<xsl:stylesheet version="1.0"
3
	xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:dc="http://purl.org/dc/elements/1.1/"
4
	xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:dri="http://www.driver-repository.eu/namespace/dri"
5
	xmlns:oaa="http://namespace.openaire.eu/oaa" xmlns:oaf="http://namespace.openaire.eu/oaf"
6
	xmlns:dnet="eu.dnetlib.actionmanager.actions.infopackage.DMFInfoPackageToHbaseXsltFunctions"
7
	xmlns:date="eu.dnetlib.miscutils.datetime.DateUtils"
8
	xmlns:exslt="http://exslt.org/common" xmlns:action="http://namespace.openaire.eu/action" 
9
	extension-element-prefixes="exslt"
10
	exclude-result-prefixes="xsl dc dr dri oaa oaf dnet exslt date">
11

  
12
	<xsl:output omit-xml-declaration="yes" indent="yes" />
13

  
14
	<xsl:param name="trust" select="string('0.9')" />
15
	<xsl:param name="provenance" select="string('UNKNOWN')" />
16
	<xsl:param name="namespaceprefix" select="string('unknown_____')" />
17

  
18
	<xsl:template match="/*">
19
		<xsl:variable name="dataInfo" select="/*[local-name() = 'record']/*[local-name() = 'about']/*[local-name() = 'datainfo']"/>
20
		<xsl:variable name="dateofcollection" select="//dr:dateOfCollection"/>
21

  
22
		<xsl:variable name="metadata" select="exslt:node-set(//*[local-name()='metadata']/*)" />
23
		<xsl:variable name="collectedDatasourceid">
24
			<xsl:choose>
25
				<xsl:when test="string-length(//oaf:collectedDatasourceid) &gt; 0">
26
					<xsl:value-of select="//oaf:collectedDatasourceid" />
27
				</xsl:when>
28
				<xsl:otherwise>
29
					<!-- Fallback value: quoted so XPath treats it as the string literal 'UNKNOWN'.
					     Unquoted, it would be a child-element name test and evaluate to an empty string. -->
					<xsl:value-of select="'UNKNOWN'" />
30
				</xsl:otherwise>
31
			</xsl:choose>
32
		</xsl:variable>
33

  
34
		<xsl:choose>
35
			<xsl:when test="count($metadata) = 0">
36
				<ACTIONS />
37
			</xsl:when>
38
			<xsl:otherwise>
39
			
40
				<xsl:variable name="objidentifier" select="/record/*[local-name() = 'header']/*[local-name() = 'objIdentifier']" />
41

  
42
				<xsl:variable name="resultId" select="dnet:oafSimpleId('result', $objidentifier)" />
43
				
44
				<xsl:variable name="hostedbyid" select="dnet:oafSplitId('datasource', //oaf:hostedBy/@id)" />
45
				<xsl:variable name="hostedbyname" select="//oaf:hostedBy/@name" />
46
				
47
				<xsl:variable name="collectedfromid" select="dnet:oafSplitId('datasource', //oaf:collectedFrom/@id)" />
48
				<xsl:variable name="collectedfromname" select="//oaf:collectedFrom/@name" />		
49
				
50
<!-- 				<xsl:variable name="country" select="substring(//dr:repositoryCountry,  1, 200)" /> -->
51
<!-- 				<xsl:variable name="accessmode" select="substring(//oaf:accessrights,   1, 200)" /> -->
52
				
53
				<xsl:variable name="result" select="dnet:oafResultFromInfoPackage($resultId, $dataInfo, $provenance, $trust, $hostedbyid, $hostedbyname, $collectedfromid, $collectedfromname, $objidentifier, $dateofcollection, $metadata)" />
54

  
55
				<ACTIONS> 
56
					<ACTION targetKey="{$resultId}" targetColumnFamily="result" targetColumn="{concat('update_', date:now())}">
57
						<xsl:value-of select="$result" />
58
					</ACTION>
59
				</ACTIONS>
60
			</xsl:otherwise>
61
		</xsl:choose>
62
	</xsl:template>
63

  
64
</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import org.junit.Before;
4
import org.junit.Test;
5

  
6
import eu.dnetlib.pace.AbstractProtoPaceTest;
7
import eu.dnetlib.pace.config.Config;
8
import eu.dnetlib.pace.config.Type;
9
import eu.dnetlib.pace.model.FieldListImpl;
10
import eu.dnetlib.pace.model.FieldValueImpl;
11
import eu.dnetlib.pace.model.MapDocument;
12

  
13
public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {
14

  
15
	private Config config;
16

  
17
	@Before
18
	public void setUp() {
19
		config = getResultFullConf();
20
	}
21

  
22
	@Test
23
	public void testCombine() {
24
		MapDocument result = result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013");
25
		FieldListImpl fl = new FieldListImpl();
26
		fl.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline"));
27

  
28
		result.getFieldMap().put("desc", fl);
29

  
30
		// NOTE(review): fl is the same instance that was put under "desc" above, so clearing and
		// refilling it presumably changes that map entry as well - confirm this aliasing is intended.
		fl.clear();
31
		fl.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty"));
32
		result.getFieldMap().get("title").add(fl);
33

  
34
		System.out.println(BlacklistAwareClusteringCombiner.filterAndCombine(result, config, config.blacklists()));
35
	}
36
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/data/mapreduce/util/OafDecoderTest.java
1
package eu.dnetlib.data.mapreduce.util;
2

  
3
import static org.junit.Assert.assertFalse;
4
import static org.junit.Assert.assertNotNull;
5

  
6
import java.util.List;
7

  
8
import org.junit.Test;
9

  
10
import eu.dnetlib.data.proto.KindProtos.Kind;
11
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
12

  
13
public class OafDecoderTest {
14

  
15
	@Test
16
	public void testAsXml() {
17

  
18
		final OafDecoder decoder = OafTest.embed(OafTest.getResult("50|id_1"), Kind.entity);
19

  
20
		assertNotNull(decoder);
21

  
22
		assertNotNull(decoder.asXml());
23

  
24
		System.out.println(IndentXmlString.apply(decoder.asXml()));
25

  
26
	}
27

  
28
	@Test
29
	public void testGetFieldValues() {
30
		final OafDecoder decoder = OafTest.embed(OafTest.getResult("50|id_1"), Kind.entity);
31

  
32
		final String path = "result/metadata/title/value";
33
		final List<String> titles = decoder.decodeEntity().getFieldValues("title", path);
34

  
35
		assertNotNull(titles);
36
		assertFalse(titles.isEmpty());
37
	}
38
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/data/transform/datacite_2_hbase.xsl
1
<?xml version="1.0" encoding="UTF-8"?>
2
<xsl:stylesheet version="1.0"
3
	xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:dc="http://purl.org/dc/elements/1.1/"
4
	xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:dri="http://www.driver-repository.eu/namespace/dri"
5
	xmlns:oaa="http://namespace.openaire.eu/oaa" xmlns:oaf="http://namespace.openaire.eu/oaf"
6
	xmlns:dnet="eu.dnetlib.data.transform.xml.DNetMdStoreDataCiteToHbaseXsltFunctions"
7
	xmlns:exslt="http://exslt.org/common" extension-element-prefixes="exslt"
8
	exclude-result-prefixes="xsl dc dr dri oaa oaf dnet exslt">
9

  
10
	<xsl:output omit-xml-declaration="yes" indent="yes" />
11
	<xsl:template match="/*">
12
		<xsl:variable name="dataInfo" select="/*[local-name() = 'record']/*[local-name() = 'about']/*[local-name() = 'datainfo']"/>
13
		<xsl:variable name="dateofcollection" select="//dri:dateOfCollection" />
14
		<xsl:variable name="metadata"
15
			select="exslt:node-set(//*[local-name()='metadata']/*)" />
16
		<xsl:variable name="namespaceprefix">
17
			<xsl:choose>
18
				<!-- TODO check namespaceprefix length is 12 -->
19
				<xsl:when test="string-length(//oaf:datasourceprefix) &gt; 0">
20
					<xsl:value-of select="//oaf:datasourceprefix" />
21
				</xsl:when>
22
				<xsl:otherwise>
23
					<!-- Fallback prefix used when the record carries no oaf:datasourceprefix.
					     Quoted so XPath treats it as a string literal; unquoted it would be a
					     child-element name test and leave $namespaceprefix empty. -->
					<xsl:value-of select="'unknown_____'" />
24
				</xsl:otherwise>
25
			</xsl:choose>
26
		</xsl:variable>
27

  
28
		<xsl:choose>
29
			<xsl:when
30
				test="count($metadata) = 0 or normalize-space(//oaf:skipRecord)= 'true'">
31
				<ROWS />
32
			</xsl:when>
33
			<xsl:otherwise>
34

  
35
				<xsl:variable name="resultId"
36
					select="dnet:oafSimpleId('result', //dri:objIdentifier)" />
37

  
38
				<xsl:if test="string-length($resultId) &gt; 0">
39
					<xsl:variable name="originalid"
40
						select="concat('',  //*[local-name() = 'resource']/*[local-name()='identifier'])" />
41
					<xsl:variable name="creators" select="//*[local-name() = 'creator']" />
42
					<xsl:variable name="titles" select="//*[local-name() = 'title']" />
43
					<xsl:variable name="subjects" select="//*[local-name() = 'subject']" />
44
					<xsl:variable name="publisher" select="//*[local-name() = 'publisher']" />
45
					<xsl:variable name="descriptions" select="//*[local-name() = 'description']" />
46
					<xsl:variable name="dates" select="//*[local-name() = 'date']" />
47
					<xsl:variable name="dateaccepted" select="//oaf:dateAccepted" />
48
					<xsl:variable name="resourceType"
49
						select="//*[local-name() = 'resourceType']" />
50
					<xsl:variable name="formats" select="//*[local-name() = 'format']" />
51
					<xsl:variable name="sizes" select="//*[local-name() = 'size']" />
52
					<xsl:variable name="rights" select="//oaf:accessrights" />
53
					<xsl:variable name="version" select="//*[local-name() = 'version']" />
54
					<xsl:variable name="language" select="//oaf:language" />
55
					<xsl:variable name="cobjcategory" select="//dr:CobjCategory" />
56

  
57
					<xsl:variable name="instanceURI">
58
						<xsl:choose>
59
							<xsl:when
60
								test="string-length( //*[local-name() = 'resource']/*[local-name()='identifier' and ./@identifierType='DOI']) &gt; 0">
61
								<xsl:value-of
62
									select="concat('http://dx.doi.org','/', //*[local-name() = 'resource']/*[local-name()='identifier' and ./@identifierType='DOI']/text())" />
63
							</xsl:when>
64
							<xsl:otherwise>
65
								<xsl:value-of
66
									select="concat('', //*[local-name() = 'resource']/*[local-name()='identifier' and ./@identifierType='URL'])" />
67
							</xsl:otherwise>
68
						</xsl:choose>
69
					</xsl:variable>
70

  
71
					<xsl:variable name="hostedbyid"
72
						select="dnet:oafSplitId('datasource', //oaf:hostedBy/@id)" />
73
					<xsl:variable name="hostedbyname" select="concat('', //oaf:hostedBy/@name)" />
74
					<xsl:variable name="collectedfromid"
75
						select="dnet:oafSplitId('datasource', //oaf:collectedFrom/@id)" />
76
					<xsl:variable name="collectedfromname"
77
						select="concat('', //oaf:collectedFrom/@name)" />
78
					<xsl:variable name="dateOfCollection"
79
						select="concat('', //dri:dateOfCollection)" />
80

  
81
					<xsl:variable name="result"
82
						select="dnet:oafResult_FromDatacite($resultId, $dataInfo, $metadata, $titles, 
83
                        $subjects, $publisher, $descriptions, $dates, $dateaccepted, $resourceType, 
84
                        $formats, $sizes, $language, $cobjcategory, $rights, $version, $hostedbyid, $hostedbyname,
85
                        $collectedfromid, $collectedfromname, $originalid, $instanceURI, $dateOfCollection)" />
86

  
87
					<ROWS>
88
						<ROW key="{$resultId}" columnFamily="result">
89
							<QUALIFIER name="body" type="base64">
90
								<xsl:value-of select="$result" />
91
							</QUALIFIER>
92
						</ROW>
93
						<xsl:for-each select="//*[local-name() = 'creator']">
94
							<xsl:variable name="personIdTemp">
95
								<xsl:choose>
96
									<xsl:when
97
										test="string-length(./*[local-name() = 'nameIdentifier']) &gt; 0">
98
										<xsl:value-of
99
											select="translate(normalize-space(./*[local-name() = 'nameIdentifier']),' .,','___')" />
100
									</xsl:when>
101
									<xsl:otherwise>
102
										<xsl:value-of
103
											select="translate(normalize-space(./*[local-name() = 'creatorName']),' .,','___')" />
104
									</xsl:otherwise>
105
								</xsl:choose>
106
							</xsl:variable>
107
							<xsl:variable name="personId"
108
								select="dnet:oafId('person', $namespaceprefix, normalize-space($personIdTemp))" />
109

  
110
							<xsl:variable name="originalPersonId"
111
								select="./*[local-name() = 'nameIdentifier']" />
112
							<xsl:variable name="position" select="position()" />
113
							<xsl:if test="string-length($personId) &gt; 0">
114
								<xsl:variable name="person"
115
									select="dnet:oafPerson_FromDatacite($personId, $dataInfo, $collectedfromid, $collectedfromname,$originalPersonId, $dateOfCollection ,normalize-space(./*[local-name() = 'creatorName']))" />
116

  
117
								<xsl:variable name="personresult"
118
									select="dnet:oafPersonResult_Authorship_FromDatacite($personId, $resultId, $position, 'isAuthorOf', $dataInfo)" />
119
								<xsl:variable name="resultperson"
120
									select="dnet:oafPersonResult_Authorship_FromDatacite($resultId, $personId, $position, 'hasAuthor', $dataInfo)" />
121
								<ROW key="{$personId}" columnFamily="person">
122
									<QUALIFIER name="body" type="base64">
123
										<xsl:value-of select="$person" />
124
									</QUALIFIER>
125
								</ROW>
126
								<ROW key="{$personId}" columnFamily="personResult_authorship_isAuthorOf">
127
									<QUALIFIER name="{$resultId}" type="base64">
128
										<xsl:value-of select="$personresult" />
129
									</QUALIFIER>
130
								</ROW>
131
								<ROW key="{$resultId}" columnFamily="personResult_authorship_hasAuthor">
132
									<QUALIFIER name="{$personId}" type="base64">
133
										<xsl:value-of select="$resultperson" />
134
									</QUALIFIER>
135
								</ROW>
136
							</xsl:if>
137
						</xsl:for-each>
138

  
139
						<xsl:for-each select="//*[local-name()='projectid']">
140

  
141
							<xsl:variable name="projectId"
142
								select="dnet:oafSplitId('project', normalize-space(.))" />
143

  
144
							<xsl:variable name="resultproject"
145
								select="dnet:oafResultProject_Outcome_FromDatacite($resultId, $projectId, 'isProducedBy', $dataInfo)" />
146
							<xsl:variable name="projectresult"
147
								select="dnet:oafResultProject_Outcome_FromDatacite($projectId, $resultId, 'produces', $dataInfo)" />
148

  
149
							<xsl:if test="string-length($projectId) &gt; 0">
150
								<ROW key="{$resultId}" columnFamily="resultProject_outcome_isProducedBy">
151
									<QUALIFIER name="{$projectId}" type="base64">
152
										<xsl:value-of select="$resultproject" />
153
									</QUALIFIER>
154
								</ROW>
155
								<ROW key="{$projectId}" columnFamily="resultProject_outcome_produces">
156
									<QUALIFIER name="{$resultId}" type="base64">
157
										<xsl:value-of select="$projectresult" />
158
									</QUALIFIER>
159
								</ROW>
160
							</xsl:if>
161
						</xsl:for-each>
162

  
163
						<xsl:for-each select="//*[local-name()='relatedPublication']">
164

  
165
							<!-- relatedPublication ids must be in the openaire format -->
166
							<xsl:variable name="publicationId" select="dnet:oafSimpleId('result', normalize-space(./@id))"/>
167

  
168
							<xsl:if test="string-length($publicationId) &gt; 0">
169

  
170
								<xsl:variable name="resultDataset" select="dnet:oafResultResult_PublicationDataset_FromDatacite($resultId, $publicationId, 'isRelatedTo', $dataInfo)"/>
171
								<xsl:variable name="datasetResult" select="dnet:oafResultResult_PublicationDataset_FromDatacite($publicationId, $resultId, 'isRelatedTo', $dataInfo)"/>
172

  
173
								<ROW key="{$resultId}" columnFamily="resultResult_publicationDataset_isRelatedTo">
174
									<QUALIFIER name="{$publicationId}" type="base64"><xsl:value-of select="$resultDataset"/></QUALIFIER>
175
								</ROW>
176
								<ROW key="{$publicationId}" columnFamily="resultResult_publicationDataset_isRelatedTo">
177
									<QUALIFIER name="{$resultId}" type="base64"><xsl:value-of select="$datasetResult"/></QUALIFIER>
178
								</ROW>
179
							</xsl:if>
180
						</xsl:for-each>
181
					</ROWS>
182
				</xsl:if>
183
			</xsl:otherwise>
184
		</xsl:choose>
185
	</xsl:template>
186
</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/data/transform/simple.xsl
1
<?xml version="1.0" encoding="UTF-8"?>
2
<xsl:stylesheet version="1.0"
3
	xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
4
	xmlns:dnet="eu.dnetlib.data.transform.xml.AbstractDNetOafXsltFunctions"
5
	exclude-result-prefixes="xsl dnet">
6
	
7
    <xsl:output omit-xml-declaration="yes" indent="yes"/>
8
	<xsl:template match="/*">
9
		<xsl:variable name="metadata" select="//*[local-name()='metadata']/*" />
10
		<ROWS>
11
			<xsl:value-of select="dnet:testParse($metadata)"/>
12
		</ROWS>
13
	</xsl:template>
14
</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/data/transform/ProtoDocumentMapperTest.java
1
package eu.dnetlib.data.transform;
2

  
3
import static org.junit.Assert.assertFalse;
4
import static org.junit.Assert.assertNotNull;
5

  
6
import java.io.IOException;
7
import java.io.StringWriter;
8

  
9
import org.apache.commons.codec.binary.Base64;
10
import org.apache.commons.io.IOUtils;
11
import org.apache.commons.logging.Log;
12
import org.apache.commons.logging.LogFactory;
13
import org.apache.solr.common.SolrInputDocument;
14
import org.apache.solr.common.SolrInputField;
15
import org.dom4j.DocumentException;
16
import org.junit.Before;
17
import org.junit.Test;
18

  
19
import com.google.protobuf.InvalidProtocolBufferException;
20
import com.googlecode.protobuf.format.JsonFormat;
21

  
22
import eu.dnetlib.data.mapreduce.util.OafTest;
23
import eu.dnetlib.data.proto.KindProtos.Kind;
24
import eu.dnetlib.data.proto.OafProtos.Oaf;
25
import eu.dnetlib.data.proto.OafProtos.OafEntity;
26
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
27

  
28
public class ProtoDocumentMapperTest {
29

  
30
	private static final Log log = LogFactory.getLog(ProtoDocumentMapperTest.class); // NOPMD by marko on 11/24/08 5:02 PM
31
	private String fields;
32

  
33
	@Before
34
	public void setUp() throws IOException {
35
		final StringWriter sw = new StringWriter();
36
		IOUtils.copy(getClass().getResourceAsStream("fields.xml"), sw);
37
		fields = sw.toString();
38
		assertNotNull(fields);
39
		assertFalse(fields.isEmpty());
40

  
41
		log.info(fields);
42
	}
43

  
44
	@Test
45
	public void testProto2SolrDocument() throws DocumentException, InvalidProtocolBufferException {
46
		final ProtoDocumentMapper mapper = new ProtoDocumentMapper(fields);
47

  
48
		assertNotNull(mapper);
49

  
50
		final OafEntity.Builder entity = OafTest.getResultBuilder("01");
51
		entity.addChildren(OafTest.getResultBuilder("01_children"));
52

  
53
		final Oaf oaf = OafTest.embed(entity.build(), Kind.entity).getOaf();
54

  
55
		assertNotNull(oaf.getEntity().getChildrenList());
56
		assertFalse(oaf.getEntity().getChildrenList().isEmpty());
57

  
58
		log.info("byte[] size: " + oaf.toByteArray().length);
59

  
60
		log.info("json size:   " + JsonFormat.printToString(oaf).length());
61

  
62
		log.info("base64 size: " + Base64.encodeBase64String(oaf.toByteArray()).length());
63

  
64
		final byte[] decodeBase64 = Base64.decodeBase64(Base64.encodeBase64String(oaf.toByteArray()));
65

  
66
		log.info("decoded: " + JsonFormat.printToString(Oaf.parseFrom(decodeBase64)));
67

  
68
		final SolrInputDocument doc = mapper.map(oaf, InputDocumentFactory.getParsedDateField("2015-02-15"), "asd");
69

  
70
		assertNotNull(doc);
71

  
72
		for (final SolrInputField f : doc.values()) {
73
			log.info(f);
74
		}
75
	}
76
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf
1
pace.conf { 
2
	conditions { },
3
	model { 
4
		title { algo = JaroWinkler, type = String, weight = 0.5, ignoreMissing = false, path = result/metadata/title/value }
5
	}
6
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java
1
package eu.dnetlib.pace.distance;
2

  
3
import static org.junit.Assert.assertTrue;
4

  
5
import java.util.List;
6

  
7
import org.junit.Test;
8

  
9
import com.google.common.collect.Lists;
10

  
11
import eu.dnetlib.pace.AbstractProtoPaceTest;
12
import eu.dnetlib.pace.config.Config;
13
import eu.dnetlib.pace.model.MapDocument;
14

  
15
public class DetectorTest extends AbstractProtoPaceTest {
16

  
17
	@Test
18
	public void testDistanceResultSimple() {
19
		final Config config = getResultSimpleConf();
20

  
21
		final MapDocument resA = result(config, "A", "Recent results from CDF");
22
		final MapDocument resB = result(config, "B", "Recent results from CDF");
23

  
24
		final double d = new PaceDocumentDistance().between(resA, resB, config);
25
		System.out.println(String.format(" d ---> %s", d));
26

  
27
		assertTrue(d == 1.0);
28
	}
29

  
30
	@Test
31
	public void testDistanceResultSimpleMissingDates() {
32
		final Config config = getResultSimpleConf();
33

  
34
		final MapDocument resA = result(config, "A", "Recent results from BES");
35
		final MapDocument resB = result(config, "A", "Recent results from CES");
36

  
37
		final double d = new PaceDocumentDistance().between(resA, resB, config);
38
		System.out.println(String.format(" d ---> %s", d));
39

  
40
		assertTrue(d > 0.97);
41
	}
42

  
43
	@Test
44
	public void testDistanceResultInvalidDate() {
45
		final Config config = getResultConf();
46

  
47
		final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
48
		final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
49

  
50
		final double d = new PaceDocumentDistance().between(resA, resB, config);
51
		System.out.println(String.format(" d ---> %s", d));
52

  
53
		assertTrue(d == 1.0);
54
	}
55

  
56
	@Test
57
	public void testDistanceResultMissingOneDate() {
58
		final Config config = getResultConf();
59

  
60
		final MapDocument resA = result(config, "A", "title title title 6BESR", null);
61
		final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
62

  
63
		final double d = new PaceDocumentDistance().between(resA, resB, config);
64
		System.out.println(String.format(" d ---> %s", d));
65

  
66
		assertTrue((d > 0.9) && (d < 1.0));
67
	}
68

  
69
	@Test
70
	public void testDistanceResult() {
71
		final Config config = getResultConf();
72

  
73
		final MapDocument resA = result(config, "A", "title title title BES", "");
74
		final MapDocument resB = result(config, "B", "title title title CLEO");
75

  
76
		final double d = new PaceDocumentDistance().between(resA, resB, config);
77
		System.out.println(String.format(" d ---> %s", d));
78

  
79
		// assertTrue(d > 0.9 && d < 1.0);
80
	}
81

  
82
	@Test
83
	public void testDistanceResultMissingTwoDate() {
84
		final Config config = getResultConf();
85

  
86
		final MapDocument resA = result(config, "A", "title title title 6BESR");
87
		final MapDocument resB = result(config, "B", "title title title 6CLER");
88

  
89
		final double d = new PaceDocumentDistance().between(resA, resB, config);
90

  
91
		System.out.println(String.format(" d ---> %s", d));
92

  
93
		assertTrue((d > 0.9) && (d < 1.0));
94
	}
95

  
96
	@Test
97
	public void testDistanceOrganizationIgnoreMissing() {
98

  
99
		final Config config = getOrganizationSimpleConf();
100

  
101
		final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
102
		final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
103

  
104
		final double d = new PaceDocumentDistance().between(orgA, orgB, config);
105
		System.out.println(String.format(" d ---> %s", d));
106

  
107
		assertTrue(d == 1.0);
108
	}
109

  
110
	@Test
111
	public void testDistanceResultCase1() {
112

  
113
		final Config config = getResultConf();
114

  
115
		final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
116
		final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
117

  
118
		final double d = new PaceDocumentDistance().between(resA, resB, config);
119
		System.out.println(String.format(" d ---> %s", d));
120

  
121
		assertTrue((d > 0.9) && (d < 1.0));
122
	}
123

  
124
	@Test
125
	public void testDistanceResultCaseDoiMatch1() {
126
		final Config config = getResultConf();
127

  
128
		final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "http://dx.doi.org/10.1594/PANGAEA.726855");
129
		final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
130

  
131
		final double d = new PaceDocumentDistance().between(resA, resB, config);
132
		System.out.println(String.format(" d ---> %s", d));
133

  
134
		assertTrue("exact DOIs will produce an exact match", d == 1.0);
135
	}
136

  
137
	@Test
138
	public void testDistanceResultCaseDoiMatch2() {
139
		final Config config = getResultConf();
140

  
141
		final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "http://dx.doi.org/10.1594/PANGAEA.726855");
142
		final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "doi:10.1594/PANGAEA.726855");
143

  
144
		final double d = new PaceDocumentDistance().between(resA, resB, config);
145
		System.out.println(String.format(" d ---> %s", d));
146

  
147
		assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0);
148
	}
149

  
150
	@Test
151
	public void testDistanceResultCaseDoiMatch3() {
152
		final Config config = getResultConf();
153

  
154
		final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
155
		final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
156

  
157
		final double d = new PaceDocumentDistance().between(resA, resB, config);
158
		System.out.println(String.format(" d ---> %s", d));
159

  
160
		assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0);
161
	}
162

  
163
	@Test
164
	public void testDistanceResultCaseDoiMatch4() {
165
		final Config config = getResultConf();
166

  
167
		final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
168
		final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
169

  
170
		final double d = new PaceDocumentDistance().between(resA, resB, config);
171
		System.out.println(String.format(" d ---> %s", d));
172

  
173
		assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0);
174
	}
175

  
176
	@Test
177
	public void testDistanceResultCaseDoiMatch5() {
178

  
179
		final Config config = getResultConf();
180

  
181
		final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
182
		final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
183

  
184
		final double d = new PaceDocumentDistance().between(resA, resB, config);
185
		System.out.println(String.format(" d ---> %s", d));
186

  
187
		assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0));
188
	}
189

  
190
	@Test
191
	public void testDistanceResultCaseDoiMatch6() {
192
		final Config config = getResultConf();
193

  
194
		final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
195
		final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
196

  
197
		final double d = new PaceDocumentDistance().between(resA, resB, config);
198
		System.out.println(String.format(" d ---> %s", d));
199

  
200
		assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d == 0.0);
201
	}
202

  
203
	// http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
204

  
205
	@Test
206
	public void testDistanceResultCaseAuthor1() {
207

  
208
		final Config config = getResultAuthorsConf();
209

  
210
		final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d");
211
		final List<String> authorsB = Lists.newArrayList("a", "b", "c");
212

  
213
		final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA);
214
		final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB);
215

  
216
		final double d = new PaceDocumentDistance().between(resA, resB, config);
217
		System.out.println(String.format(" d ---> %s", d));
218

  
219
		assertTrue(d == 0.0);
220
	}
221

  
222
	@Test
223
	public void testDistanceResultCaseAuthor2() {
224

  
225
		final Config config = getResultAuthorsConf();
226

  
227
		final List<String> authorsA = Lists.newArrayList("a", "b", "c");
228
		final List<String> authorsB = Lists.newArrayList("a", "b", "c");
229

  
230
		final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA);
231
		final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB);
232

  
233
		final double d = new PaceDocumentDistance().between(resA, resB, config);
234
		System.out.println(String.format(" d ---> %s", d));
235

  
236
		assertTrue(d == 1.0);
237
	}
238

  
239
	@Test
240
	public void testDistanceResultCaseAuthor3() {
241

  
242
		final Config config = getResultAuthorsConf();
243

  
244
		final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
245
		final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
246

  
247
		final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA);
248
		final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB);
249

  
250
		final double d = new PaceDocumentDistance().between(resA, resB, config);
251
		System.out.println(String.format(" d ---> %s", d));
252

  
253
		assertTrue((d > 0.9) && (d < 1.0));
254
	}
255

  
256
	@Test
257
	public void testDistanceResultCaseAuthor4() {
258

  
259
		final Config config = getResultAuthorsConf();
260

  
261
		final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
262
		final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
263

  
264
		final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA);
265
		final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB);
266

  
267
		final double d = new PaceDocumentDistance().between(resA, resB, config);
268
		System.out.println(String.format(" d ---> %s", d));
269

  
270
		// assertTrue(d == 0.0);
271
	}
272

  
273
	@Test
274
	public void testDistanceResultFullConf() {
275

  
276
		final Config config = getResultFullConf();
277

  
278
		final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
279
		final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
280

  
281
		final MapDocument resA =
282
				result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
283
						"10.1186/1752-1947-4-299", authorsA);
284
		final MapDocument resB =
285
				result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", null,
286
						authorsB);
287

  
288
		final double d = new PaceDocumentDistance().between(resA, resB, config);
289
		System.out.println(String.format(" d ---> %s", d));
290

  
291
		// assertTrue(d == 0.0);
292
	}
293

  
294
	@Test
295
	public void testDistanceStdl1() {
296

  
297
		final Config config = getResultStdlConf();
298

  
299
		final List<String> authorsA = Lists.newArrayList();
300
		final List<String> authorsB =
301
				Lists.newArrayList("Giovannelli A.", "Traversi M. L.", "Sebastiani L.", "Tognetti R.", "d?Andria R.", "Morelli G.", "Fragnito F.", "Lavini A.");
302

  
303
		final MapDocument resA =
304
				result(config,
305
						"A",
306
						"Evaluating water use strategies in olive trees grown under different irrigation regimes through integrated approaches between sap flow and high resolution stem growth analysis",
307
						"2008-01-01",
308
						null, authorsA);
309
		final MapDocument resB =
310
				result(config,
311
						"B",
312
						"Evaluating water use strategies in olive trees grown under different irrigation regimes through integrated approaches between sap flow and high resolution stem growth analysis",
313
						"2008-01-01",
314
						null,
315
						authorsB);
316

  
317
		final double d = new PaceDocumentDistance().between(resA, resB, config);
318
		System.out.println(String.format(" d ---> %s", d));
319

  
320
		// assertTrue(d == 0.0);
321
	}
322

  
323
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf
1
pace.conf { 
2
	clustering { 
3
		acronyms { fields = [title, desc], params = { max = 1, minLen = 2, maxLen = 4} }, 
4
		ngrampairs { fields = [title], params = { max = 1, ngramLen = 3} }, 
5
		suffixprefix { fields = [title], params = { max = 1, len = 3 } }
6
	}, 
7
	conditions { 
8
		titleVersionMatch { fields = [title] },
9
		sizeMatch { fields = [authors] }  
10
	},
11
	model { 
12
		title { algo = JaroWinkler, type = String, weight = 0.5, ignoreMissing = false, path = result/metadata/title/value },
13
		authors { algo = SortedLevel2JaroWinkler, type = List, weight = 0.5, ignoreMissing = true, path = result/author/metadata/fullname/value } 		
14
	}, 
15
	blacklists = {
16
		title = [
17
			"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
18
			"^(Kiri Karl Morgensternile).*$",
19
			"^(\\[Eksliibris Aleksandr).*\\]$",
20
			"^(\\[Eksliibris Aleksandr).*$",
21
			"^(Eksliibris Aleksandr).*$",
22
			"^(Kiri A\\. de Vignolles).*$",
23
			"^(2 kirja Karl Morgensternile).*$",
24
			"^(Pirita kloostri idaosa arheoloogilised).*$",
25
			"^(Kiri tundmatule).*$",
26
			"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
27
			"^(Eksliibris Nikolai Birukovile).*$",
28
			"^(Eksliibris Nikolai Issakovile).*$",
29
			"^(WHP Cruise Summary Information of section).*$",
30
			"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
31
			"^(Measurement of the spin\\-dependent structure function).*"
32
		] }
33
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java
1
package eu.dnetlib.pace.model;
2

  
3
import org.junit.Test;
4

  
5
import eu.dnetlib.pace.AbstractProtoPaceTest;
6
import eu.dnetlib.pace.config.Config;
7

  
8
public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {
9

  
10
	@Test
11
	public void test1() {
12

  
13
		String id = "12345";
14

  
15
		Config config = getResultConf();
16

  
17
		MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.fields());
18

  
19
		System.out.println(document);
20

  
21
		String stringDoc = MapDocumentSerializer.toString(document);
22

  
23
		System.out.println(stringDoc);
24

  
25
		MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes());
26

  
27
		System.out.println(decoded);
28
	}
29

  
30
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/transform/OafEntityMerger.java
1
package eu.dnetlib.data.transform;
2

  
3
import java.util.List;
4
import java.util.Map;
5
import java.util.Set;
6

  
7
import com.google.common.base.Predicate;
8
import com.google.common.collect.Iterables;
9
import com.google.common.collect.Lists;
10
import com.google.common.collect.Maps;
11
import com.google.common.collect.Sets;
12
import com.google.protobuf.Descriptors.FieldDescriptor;
13
import com.google.protobuf.Message.Builder;
14

  
15
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
16
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
17
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
18
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
19
import eu.dnetlib.data.proto.KindProtos.Kind;
20
import eu.dnetlib.data.proto.OafProtos.Oaf;
21
import eu.dnetlib.data.proto.OafProtos.OafEntity;
22
import eu.dnetlib.data.proto.PersonProtos.Person;
23
import eu.dnetlib.data.proto.ResultProtos.Result;
24
import eu.dnetlib.data.proto.SpecialTrustProtos.SpecialTrust;
25
import eu.dnetlib.pace.util.DedupConfig;
26

  
27
public class OafEntityMerger {
28

  
29
	private static final String DEDUP_CLASSID = "sysimport:dedup";
30

  
31
	private static final String DNET_PROVENANCE_SCHEME = "dnet:provenanceActions";
32

  
33
	private final Predicate<StringField> skipEmptyStringField = new Predicate<StringField>() {
34

  
35
		@Override
36
		public boolean apply(final StringField s) {
37
			return (s != null) && (s.getValue() != null) && !s.getValue().isEmpty();
38
		}
39
	};
40

  
41
	private final Predicate<String> skipEmptyString = new Predicate<String>() {
42

  
43
		@Override
44
		public boolean apply(final String s) {
45
			return (s != null) && !s.isEmpty();
46
		}
47
	};
48

  
49
	public static Oaf.Builder merge(final String id, final Iterable<Oaf> entities) {
50
		return merge(null, id, entities);
51
	}
52

  
53
	public static Oaf.Builder merge(final DedupConfig dedupConf, final String id, final Iterable<Oaf> entities) {
54
		return new OafEntityMerger().mergeEntities(dedupConf, id, entities);
55
	}
56

  
57
	public static Oaf.Builder merge(final Oaf.Builder builder) {
58
		return new OafEntityMerger().doMergeEntities(builder);
59
	}
60

  
61
	public Oaf.Builder mergeEntities(final DedupConfig dedupConf, final String id, final Iterable<Oaf> entities) {
62

  
63
		Oaf.Builder builder = Oaf.newBuilder();
64
		String trust = "0.0";
65
		for (final Oaf oaf : TrustOrdering.sort(entities)) {
66
			// doublecheck we're dealing only with main entities
67
			if (!oaf.getKind().equals(Kind.entity)) throw new IllegalArgumentException("expected OafEntity!");
68

  
69
			final String currentTrust = oaf.getDataInfo().getTrust();
70
			if (!currentTrust.equals(SpecialTrust.NEUTRAL.toString())) {
71
				trust = currentTrust;
72
			}
73
			builder.mergeFrom(oaf);
74
		}
75

  
76
		builder = doMergeEntities(builder);
77
		builder.getEntityBuilder().setId(id);
78
		builder.getDataInfoBuilder()
79
				.setInferred(true)
80
				.setDeletedbyinference(false)
81
				.setTrust(trust)
82
				.setProvenanceaction(getProvenanceAction());
83

  
84
		if ((dedupConf != null) && dedupConf.isIncludeChildren()) {
85
			for (final Oaf oaf : Iterables.limit(entities, dedupConf.getMaxChildren())) {
86
				builder.getEntityBuilder().addChildren(oaf.getEntity());
87
			}
88
		}
89

  
90
		return builder;
91
	}
92

  
93
	private Qualifier.Builder getProvenanceAction() {
94
		return Qualifier.newBuilder().setClassid(DEDUP_CLASSID).setClassname(DEDUP_CLASSID).setSchemeid(DNET_PROVENANCE_SCHEME)
95
				.setSchemename(DNET_PROVENANCE_SCHEME);
96
	}
97

  
98
	public Oaf.Builder doMergeEntities(final Oaf.Builder builder) {
99

  
100
		switch (builder.getEntity().getType()) {
101
		case datasource:
102
			break;
103
		case organization:
104
			break;
105
		case person:
106
			final Person.Metadata.Builder person = builder.getEntityBuilder().getPersonBuilder().getMetadataBuilder();
107
			for (final String field : Lists.newArrayList("secondnames")) {
108
				setSingleString(person, field);
109
			}
110
			break;
111
		case project:
112
			break;
113
		case result:
114
			final Result.Metadata.Builder result = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder();
115
			setTitle(result);
116

  
117
			// for (String field : Lists.newArrayList("subject", "relevantdate")) {
118
			for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SUBJECT_FIELD_NUMBER,
119
					Result.Metadata.RELEVANTDATE_FIELD_NUMBER)) {
120
				setStructuredProperty(result, field);
121
			}
122
			for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.DESCRIPTION_FIELD_NUMBER)) {
123
				setLongestStringField(result, field);
124
			}
125
			for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SOURCE_FIELD_NUMBER)) {
126
				setUniqueStringField(result, field);
127
			}
128
			for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.COLLECTEDFROM_FIELD_NUMBER)) {
129
				setKeyValues(builder.getEntityBuilder(), field);
130
			}
131
			for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.PID_FIELD_NUMBER)) {
132
				setStructuredProperty(builder.getEntityBuilder(), field);
133
			}
134
			for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.ORIGINALID_FIELD_NUMBER)) {
135
				setUniqueString(builder.getEntityBuilder(), field);
136
			}
137

  
138
			// remove the inner authors, rely on the children
139
			builder.getEntityBuilder().getResultBuilder().clearAuthor();
140
			break;
141
		default:
142
			break;
143
		}
144
		return builder;
145
	}
146

  
147
	/**
148
	 * Helper method, avoid duplicated StructuredProperties in the given builder for the given fieldName
149
	 *
150
	 * @param builder
151
	 * @param fieldName
152
	 */
153
	@SuppressWarnings("unchecked")
154
	private void setStructuredProperty(final Builder builder, final String fieldName) {
155
		final Map<String, StructuredProperty> map = Maps.newHashMap();
156
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
157
		final List<StructuredProperty> sps = (List<StructuredProperty>) builder.getField(fd);
158

  
159
		if ((sps != null) && !sps.isEmpty()) {
160
			for (final StructuredProperty sp : sps) {
161
				map.put(sp.getValue(), sp);
162
			}
163

  
164
			if (!map.isEmpty()) {
165
				builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
166
			}
167
		}
168
	}
169

  
170
	/**
171
	 * Helper method, avoid duplicated KeyValues in the given builder for the given fieldName
172
	 *
173
	 * @param builder
174
	 * @param fieldName
175
	 */
176
	@SuppressWarnings("unchecked")
177
	private void setKeyValues(final Builder builder, final String fieldName) {
178
		final Map<String, KeyValue> map = Maps.newHashMap();
179
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
180
		final List<KeyValue> kvs = (List<KeyValue>) builder.getField(fd);
181

  
182
		if ((kvs != null) && !kvs.isEmpty()) {
183
			for (final KeyValue sp : kvs) {
184
				map.put(sp.getKey(), sp);
185
			}
186

  
187
			if (!map.isEmpty()) {
188
				builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
189
			}
190
		}
191
	}
192

  
193
	@SuppressWarnings("unchecked")
194
	private void setSingleString(final Builder builder, final String fieldName) {
195

  
196
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
197
		final List<StringField> field = (List<StringField>) builder.getField(fd);
198
		if ((field != null) && !field.isEmpty()) {
199
			final StringField s = (StringField) Iterables.getLast(Iterables.filter(field, skipEmptyStringField), "");
200

  
201
			if ((s != null) && (s.getValue() != null) && !s.getValue().isEmpty()) {
202
				builder.clearField(fd).setField(fd, Lists.newArrayList(s));
203
			}
204
		}
205
	}
206

  
207
	@SuppressWarnings("unchecked")
208
	private void setLongestStringField(final Builder builder, final String fieldName) {
209

  
210
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
211
		final List<StringField> field = (List<StringField>) builder.getField(fd);
212

  
213
		if ((field != null) && !field.isEmpty()) {
214
			final StringField.Builder max = StringField.newBuilder().setValue("");
215
			int maxLength = 0;
216
			for (final StringField sf : field) {
217
				if (sf.getValue().length() > maxLength) {
218
					maxLength = sf.getValue().length();
219
					max.clear();
220
					max.mergeFrom(sf);
221
				}
222
			}
223

  
224
			builder.clearField(fd).setField(fd, Lists.newArrayList(max.build()));
225
		}
226
	}
227

  
228
	@SuppressWarnings("unchecked")
229
	private void setUniqueStringField(final Builder builder, final String fieldName) {
230

  
231
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
232
		final List<StringField> field = (List<StringField>) builder.getField(fd);
233
		final Map<String, StringField> map = Maps.newHashMap();
234
		if ((field != null) && !field.isEmpty()) {
235
			for (final StringField s : Iterables.filter(field, skipEmptyStringField)) {
236
				map.put(s.getValue(), s);
237
			}
238

  
239
			builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
240
		}
241
	}
242

  
243
	@SuppressWarnings("unchecked")
244
	private void setUniqueString(final Builder builder, final String fieldName) {
245

  
246
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
247
		final List<String> field = (List<String>) builder.getField(fd);
248
		final Set<String> set = Sets.newHashSet();
249
		if ((field != null) && !field.isEmpty()) {
250
			for (final String s : Iterables.filter(field, skipEmptyString)) {
251
				set.add(s);
252
			}
253

  
254
			builder.clearField(fd).setField(fd, Lists.newArrayList(set));
255
		}
256
	}
257

  
258
	private void setTitle(final Result.Metadata.Builder metadata) {
259
		final Iterable<StructuredProperty> filtered = Iterables.filter(metadata.getTitleList(), OafUtils.mainTitleFilter());
260

  
261
		if (!Iterables.isEmpty(filtered)) {
262
			metadata.clearTitle().addTitle(Iterables.getLast(filtered));
263
		}
264
	}
265

  
266
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/transform/ProtoDocumentMapper.java
1
package eu.dnetlib.data.transform;
2

  
3
import java.io.StringReader;
4

  
5
import org.apache.commons.codec.binary.Base64;
6
import org.apache.commons.lang.StringUtils;
7
import org.apache.solr.common.SolrInputDocument;
8
import org.dom4j.Document;
9
import org.dom4j.DocumentException;
10
import org.dom4j.Element;
11
import org.dom4j.io.SAXReader;
12

  
13
import com.google.common.base.Splitter;
14
import com.google.common.collect.Lists;
15
import com.google.protobuf.GeneratedMessage;
16

  
17
/**
18
 * The Class ProtoDocumentMapper.
19
 */
20
public class ProtoDocumentMapper extends AbstractProtoMapper {
21

  
22
	/** The fields. */
23
	private Document fields;
24

  
25
	/**
26
	 * Instantiates a new proto document mapper.
27
	 *
28
	 * @param fields
29
	 *            the fields
30
	 * @throws DocumentException
31
	 *             the document exception
32
	 */
33
	public ProtoDocumentMapper(final String fields) throws DocumentException {
34
		this.fields = parse(fields);
35

  
36
		if (StringUtils.isBlank(this.fields.valueOf("//FIELD[@name = 'objIdentifier']/@name")))
37
			throw new IllegalArgumentException("field objIdentifier is mandatory");
38
	}
39

  
40
	/**
41
	 * Map.
42
	 *
43
	 * @param proto
44
	 *            the proto
45
	 * @param version
46
	 *            the version
47
	 * @param dsId
48
	 *            the ds id
49
	 * @return the solr input document
50
	 * @throws DocumentException
51
	 *             the document exception
52
	 */
53
	public SolrInputDocument map(final GeneratedMessage proto, final String version, final String dsId) throws DocumentException {
54

  
55
		final SolrInputDocument doc = new SolrInputDocument();
56

  
57
		for (final Object o : fields.selectNodes("//FIELD")) {
58
			final Element e = (Element) o;
59

  
60
			final String name = e.attribute("name").getValue().toLowerCase().trim();
61
			final String path = e.attribute("path").getValue();
62

  
63
			doc.setField(name, processMultiPath(proto, Lists.newLinkedList(Splitter.on("|").trimResults().split(path))));
64
		}
65

  
66
		doc.setField("__dsid", dsId);
67
		doc.setField("__dsversion", version);
68
		doc.setField("objidentifier", patchId((String) doc.getFieldValue("objidentifier")));
69
		doc.setField("__indexrecordidentifier", doc.getFieldValue("objidentifier"));
70
		doc.setField("__result", Base64.encodeBase64String(proto.toByteArray()));
71

  
72
		return doc;
73
	}
74

  
75
	/**
76
	 * Patch the objidentifier: when it comes from HBase, i.e. contains the separator '|' returns the string that follows.
77
	 *
78
	 * @param objidentifier
79
	 *            the objidentifier
80
	 * @return the string
81
	 */
82
	private String patchId(final String objidentifier) {
83
		return objidentifier.contains("|") ? StringUtils.substringAfter(objidentifier, "|") : objidentifier;
84
	}
85

  
86
	/**
87
	 * Parses the.
88
	 *
89
	 * @param s
90
	 *            the s
91
	 * @return the document
92
	 * @throws DocumentException
93
	 *             the document exception
94
	 */
95
	private Document parse(final String s) throws DocumentException {
96
		return new SAXReader().read(new StringReader(s));
97
	}
98

  
99
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/pom.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet-hadoop-parent</artifactId>
6
		<version>1.0.0</version>
7
		<relativePath />
8
	</parent>
9
	<modelVersion>4.0.0</modelVersion>
10
	<groupId>eu.dnetlib</groupId>
11
	<artifactId>dnet-openaireplus-mapping-utils</artifactId>
12
	<packaging>jar</packaging>
13
	<version>3.0.7-SNAPSHOT</version>
14
	<scm>
15
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-openaireplus-mapping-utils/trunk</developerConnection>
16
	</scm>
17
	<dependencies>
18
		<dependency>
19
			<groupId>com.google.guava</groupId>
20
			<artifactId>guava</artifactId>
21
			<version>${google.guava.version}</version>
22
		</dependency>		
23
		<dependency>
24
			<groupId>junit</groupId>
25
			<artifactId>junit</artifactId>
26
			<version>${junit.version}</version>
27
			<scope>test</scope>
28
		</dependency>
29
		<dependency>
30
			<groupId>commons-codec</groupId>
31
			<artifactId>commons-codec</artifactId>
32
			<version>${commons.codec.version}</version>
33
		</dependency>
34
		<dependency>
35
			<groupId>dom4j</groupId>
36
			<artifactId>dom4j</artifactId>
37
			<version>${dom4j.version}</version>
38
		</dependency>
39
		<dependency>
40
			<groupId>eu.dnetlib</groupId>
41
			<artifactId>dnet-openaire-data-protos</artifactId>
42
			<version>[3.0.0,4.0.0)</version>
43
		</dependency>
44
		<dependency>
45
			<groupId>eu.dnetlib</groupId>
46
			<artifactId>dnet-pace-core</artifactId>
47
			<version>[1.3.0,2.0.0)</version>
48
		</dependency>
49
		<dependency>
50
			<groupId>eu.dnetlib</groupId>
51
			<artifactId>cnr-misc-utils</artifactId>
52
			<version>[1.0.0,2.0.0)</version>
53
		</dependency>
54
		<dependency>
55
			<groupId>eu.dnetlib</groupId>
56
			<artifactId>dnet-hadoop-commons</artifactId>
57
			<version>[1.0.0,2.0.0)</version>
58
		</dependency>
59
		<dependency>
60
			<groupId>eu.dnetlib</groupId>
61
			<artifactId>dnet-index-solr-common</artifactId>
62
			<version>[1.0.0,2.0.0)</version>
63
		</dependency>	
64
		<dependency>
65
			<groupId>com.googlecode.protobuf-java-format</groupId>
66
			<artifactId>protobuf-java-format</artifactId>
67
			<version>1.2</version>
68
			<scope>test</scope>
69
		</dependency>		
70
	</dependencies>
71
</project>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/data/mapreduce/util/OafRelDecoderTest.java
1
package eu.dnetlib.data.mapreduce.util;
2

  
3
import static org.junit.Assert.assertEquals;
4
import static org.junit.Assert.assertNotNull;
5

  
6
import org.junit.Before;
7
import org.junit.Test;
8

  
9
import com.google.protobuf.Descriptors.FieldDescriptor;
10

  
11
import eu.dnetlib.data.proto.OafProtos.OafRel;
12
import eu.dnetlib.data.proto.PersonResultProtos.PersonResult.Authorship;
13
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
14
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
15

  
16
public class OafRelDecoderTest {
17

  
18
	private OafRel oafRel;
19

  
20
	@Before
21
	public void setUp() {
22
		oafRel = OafTest.getPersonResult("ID_1", "ID_2", "1", "isAuthor");
23
	}
24

  
25
	@Test
26
	public void testSetClass() {
27

  
28
		OafRelDecoder d1 = OafRelDecoder.decode(oafRel);
29

  
30
		assertNotNull(d1);
31
		assertEquals("isAuthor", d1.getRelClass());
32

  
33
		OafRelDecoder d2 = OafRelDecoder.decode(d1.setClassId("hasAuthor").build());
34

  
35
		assertEquals("hasAuthor", d2.getRelClass());
36
		assertEquals("hasAuthor", d2.getRelMetadata().getSemantics().getClassid());
37
		assertEquals("hasAuthor", d2.getRelMetadata().getSemantics().getClassname());
38

  
39
		FieldDescriptor fd = Authorship.getDescriptor().findFieldByName("ranking");
40
		assertEquals(d1.getSubRel().getField(fd), d2.getSubRel().getField(fd));
41
	}
42

  
43
	@Test
44
	public void testGetCF() {
45
		assertEquals("personResult_authorship_isAuthorOf", OafRelDecoder.getCFQ(RelType.personResult, SubRelType.authorship, Authorship.RelName.isAuthorOf));
46
		assertEquals("personResult_authorship_isAuthorOf", OafRelDecoder.getCFQ(RelType.personResult, SubRelType.authorship, "isAuthorOf"));
47
	}
48

  
49
}
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/transform/AbstractProtoMapper.java
1
package eu.dnetlib.data.transform;
2

  
3
import java.util.List;
4

  
5
import org.apache.commons.lang.StringUtils;
6

  
7
import com.google.common.base.Splitter;
8
import com.google.common.collect.Lists;
9
import com.google.protobuf.Descriptors.EnumValueDescriptor;
10
import com.google.protobuf.Descriptors.FieldDescriptor;
11
import com.google.protobuf.GeneratedMessage;
12

  
13
/**
14
 * AbstractProtoMapper provide common navigation methods on the protocolbuffers Messages.
15
 *
16
 * @author claudio
17
 */
18
public abstract class AbstractProtoMapper {
19

  
20
	/** The Constant PATH_SEPARATOR. */
21
	private static final String PATH_SEPARATOR = "/";
22

  
23
	/**
24
	 * Process multi path.
25
	 *
26
	 * @param proto
27
	 *            the proto
28
	 * @param paths
29
	 *            the paths
30
	 * @return the list
31
	 */
32
	protected List<Object> processMultiPath(final GeneratedMessage proto, final List<String> paths) {
33
		final List<Object> response = Lists.newArrayList();
34
		for (final String pathElements : paths) {
35
			response.addAll(processPath(proto, pathElements));
36
		}
37
		return response;
38
	}
39

  
40
	/**
41
	 * Process path.
42
	 *
43
	 * @param proto
44
	 *            the proto
45
	 * @param path
46
	 *            the path
47
	 * @return the list
48
	 */
49
	protected List<Object> processPath(final GeneratedMessage proto, final String path) {
50
		return processPath(proto, Lists.newLinkedList(Splitter.on(PATH_SEPARATOR).trimResults().split(path)));
51
	}
52

  
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff