Changeset 2296


Ignore:
Timestamp:
Mar 3, 2009 1:40:56 AM (10 years ago)
Author:
hazmat
Message:

Packaging fixes, ODT transform in Python, pdftohtml for OS X testing

Location:
ore.transforms/trunk
Files:
6 edited
1 moved

Legend:

Unmodified
Added
Removed
  • ore.transforms/trunk/ore/transforms/common.py

    r2295 r2296  
    2929    from zope import component 
    3030    for transform in [ pdf_to_text, rtf_to_html, ppt_to_text, 
    31                        xls_to_text, word_to_html, odf_to_text ] 
     31                       xls_to_text, word_to_html, odf_to_text ]: 
    3232 
    3333        for i in transform.inputs: 
     
    3737                ) 
    3838 
    39 class pdf_to_text(transform.CommandTransform): 
     39class pdf_to_text(transform.StdoutTransform): 
    4040 
    4141    inputs   = ('application/pdf',) 
     
    4343    output_encoding = 'utf-8' 
    4444     
    45     binary = "pdftotext" 
    46     binaryArgs = " -enc UTF-8" 
     45    #binary = "pdftotext" 
     46    #binaryArgs = " -enc UTF-8" 
     47    binary = "pdftohtml"     
     48    binaryArgs = "-i -noframes -enc UTF-8 -stdout" 
    4749 
    4850class rtf_to_html(transform.CommandTransform): 
     
    9496        html_file.close() 
    9597 
    96 class odf_to_text( transform.CommandTransform ): 
     98class odf_to_text( transform.Transform ): 
    9799 
    98100    inputs = ('application/vnd.oasis.opendocument.text', 
     
    109111     
    110112    output = 'text/plain' 
    111     binary = "ooo_as_text" 
    112      
    113     def formatCommand( self, work_dir, source_copy, output_copy, **kw ): 
    114         cmd = 'cd "%s" && %s --output-file="%s" "%s" 2>error_log 1>/dev/null' % ( 
    115             work_dir, self.binary, output_copy, source_copy) 
    116         return cmd 
     113 
     114    def transform( self, input_stream, output_stream, **options ): 
     115        from ooopy.OOoPy import OOoPy 
     116        o = OOoPy( input_stream ) 
     117        content = o.read('content.xml') 
     118        self.as_text( content.getroot(), output_stream ) 
     119        output_stream.seek(0) 
     120 
     121    def as_text( self, node, out) : 
     122        if node.text is not None : 
     123            print >> out, node.text.encode ('utf-8'), 
     124        for subnode in node : 
     125            self.as_text(subnode, out) 
     126        if node.tail is not None : 
     127            print >> out, node.tail.encode ('utf-8'), 
     128 
  • ore.transforms/trunk/ore/transforms/context.py

    r2295 r2296  
    88    interface.implements( interfaces.ITransformContext ) 
    99     
    10     def __init__( self, source_file=None, source_mime_type=None, target_file=None, target_mime_type="text/plain") 
     10    def __init__( self, source_file=None, source_mime_type=None, target_file=None, target_mime_type="text/plain"): 
    1111        self.source_file = source_file 
    1212        self.source_mime_type = source_mime_type 
  • ore.transforms/trunk/ore/transforms/interfaces.py

    r2295 r2296  
    1212class ITransformContext( interface.Interface ): 
    1313 
    14     input_mime_type = schema.ASCIILine(__doc__ = u"Input Mime Type Name" ) 
    15     output_mime_type = schema.ASCIILine( __doc__ = u"Output Mime Type Name" ) 
     14    input_mime_type = schema.ASCIILine( title = u"Input Mime Type Name" ) 
     15    output_mime_type = schema.ASCIILine( title = u"Output Mime Type Name" ) 
    1616     
    17     input_stream = interface.IObject( schema=INamedFileStream ) 
    18     output_stream = interface.IObject( schema=INamedFileStream ) 
     17    input_stream = schema.Object( schema=INamedFileStream ) 
     18    output_stream = schema.Object( schema=INamedFileStream ) 
    1919 
    2020class ITransform( interface.Interface ): 
  • ore.transforms/trunk/ore/transforms/tests/test_transforms.py

    r2293 r2296  
    88""" 
    99 
    10 import unittest, os 
     10import unittest, os, inspect 
    1111 
    1212from ore.transforms import common 
     13 
     14package_path = os.path.join( os.path.dirname( inspect.getabsfile( common ) ), 'tests') 
     15 
     16def resourcepath( *parts ): 
     17    return os.path.join( package_path, *parts ) 
     18     
    1319 
    1420class test_transforms(unittest.TestCase): 
    1521 
    1622    def tearDown( self ): 
    17         if os.path.exists('data.txt'): 
    18             os.unlink('data.txt') 
     23        if os.path.exists(resourcepath('data.txt')): 
     24            os.unlink(resourcepath('data.txt')) 
    1925         
    2026    def xtest_word_text( self ): 
    21         fh = open('data.doc') 
    22         out = open('data.txt', 'w') 
     27        fh = open(resourcepath( 'data.doc') ) 
     28        out = open(resourcepath('data.txt'), 'w') 
    2329 
    2430        transformer = common.word_to_html() 
     
    2834 
    2935    def test_pdf_text( self ): 
    30         fh = open('data.pdf') 
    31         out = open('data.txt', 'w+') 
     36        fh = open( resourcepath('data.pdf') ) 
     37        out = open( resourcepath('data.txt'), 'w+') 
    3238        transformer = common.pdf_to_text() 
    3339        transformer.transform( fh, out ) 
     
    4046    def xtest_xls_text( self ): 
    4147        # can't really test this easily on dev platform 
    42         fh = open('data.xls') 
    43         out = open('data.txt', 'w') 
     48        fh = open( resourcepath('data.xls') ) 
     49        out = open( resourcepath('data.txt'), 'w') 
    4450 
    4551        transformer = common.pdf_to_text() 
    4652        transformer.transform( fh, out ) 
    47         value = out.read()         
     53        value = out.read() 
    4854 
     55    def test_odt_text( self ): 
     56        fh = open( resourcepath('data.odt') ) 
     57        out = open( resourcepath('data.txt'), 'w+')         
     58 
     59        transformer = common.odf_to_text() 
     60        transformer.transform( fh, out ) 
     61        value = out.read() 
     62        assert "No linguistic content" in value 
    4963         
     64def test_suite( ): 
     65 
     66    return unittest.makeSuite( test_transforms ) 
     67     
    5068         
    5169     
  • ore.transforms/trunk/ore/transforms/transform.py

    r2295 r2296  
    121121 
    122122    def formatCommand( self, work_dir, source_copy, output_copy, **kw ): 
    123         cmd = 'cd "%s" && %s %s "%s" > "%s" 2>error_log 1>/dev/null' % ( 
     123        cmd = 'cd "%s" && %s %s "%s" > "%s" 2>error_log' % ( 
    124124            work_dir, self.binary_path, self.binaryArgs, source_copy, output_copy) 
    125125        return cmd 
  • ore.transforms/trunk/setup.py

    r2295 r2296  
    2121    author='Kapil Thangavelu', 
    2222    author_email='kapil.foss@gmail.com', 
     23    dependency_links = ["http://downloads.sourceforge.net/ooopy/OOoPy-1.4.4873.tar.gz#egg=OOoPy"], 
    2324    description="Text Transformations for Common Document Formats", 
    2425    long_description=( 
Note: See TracChangeset for help on using the changeset viewer.