# whisper/farsi_transcriber_web/backend/app.py

"""
Farsi Transcriber Backend API

Flask API for handling audio/video file transcription using Whisper model.
Configured for Railway deployment.
"""

import os
import sys
import tempfile
import threading
from pathlib import Path

import whisper
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.utils import secure_filename

# Put the directory three levels above this file at the front of sys.path
# so imports resolve against it first.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

app = Flask(__name__)
# NOTE(review): CORS is enabled only for paths matching /api/*, but every
# route registered in this file ("/", "/health", "/transcribe", "/models",
# "/export") lives outside that prefix. Unless something not visible here
# registers routes under /api/, no response gets CORS headers — confirm
# against the deployment/front-end setup.
CORS(app, resources={r"/api/*": {"origins": "*"}})

# Configuration
# Uploads are written to the system temporary directory.
UPLOAD_FOLDER = tempfile.gettempdir()
# Lower-case file extensions (audio and video containers) accepted by
# allowed_file().
ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'aac', 'wma', 'mp4', 'mkv', 'mov', 'webm', 'avi', 'flv', 'wmv'}
MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MiB, expressed in bytes

# Production settings
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Upper bound on the request body size accepted by Flask.
app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE
# Reported back to clients by the /health endpoint; defaults to 'production'
# when FLASK_ENV is not set.
app.config['ENV'] = os.getenv('FLASK_ENV', 'production')

# Whisper model handle. Left as None at import time and populated by
# load_model() on first use, so importing this module stays cheap.
model = None

# Serialises the first-use model load. The app is run with threaded=True and
# both /health and /transcribe call load_model(), so without this several
# concurrent first requests could each load their own copy of the model.
_model_lock = threading.Lock()


def load_model():
    """Return the shared Whisper model, loading it on first use.

    The 'medium' model is loaded at most once at a time: callers that arrive
    while another thread is loading wait for that load instead of starting
    their own.

    Returns:
        The loaded model, or None if loading failed. A failed load is not
        cached, so the next call tries again.
    """
    global model

    # Fast path: already loaded, no locking needed.
    if model is not None:
        return model

    with _model_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if model is None:
            print("Loading Whisper model...")
            try:
                # Only the load itself is guarded, so a failure in the status
                # prints is never mistaken for a model-loading failure.
                model = whisper.load_model('medium')
            except Exception as e:
                print(f"✗ Error loading Whisper model: {e}")
                model = None
            else:
                print("✓ Whisper model loaded successfully")
    return model


def allowed_file(filename):
    """Return True when the filename's final extension is in ALLOWED_EXTENSIONS.

    The comparison is case-insensitive and only the text after the last dot
    counts. Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[-1]
    return extension.lower() in ALLOWED_EXTENSIONS


@app.route('/', methods=['GET'])
def index():
    """Return a small JSON banner with the service name, version and status."""
    banner = {
        'message': 'Farsi Transcriber API',
        'version': '1.0.0',
        'status': 'running',
    }
    return jsonify(banner)


@app.route('/health', methods=['GET'])
def health():
    """Report service health, model availability and the configured environment.

    Calls load_model(), so a probe made before the model has been loaded
    triggers the load.
    """
    # NOTE(review): the first health probe therefore pays the full
    # model-loading cost — confirm this warm-up is intended for the
    # platform's health-check timeout.
    is_loaded = load_model() is not None
    return jsonify(
        status='healthy',
        model_loaded=is_loaded,
        environment=app.config['ENV'],
    )


def _format_timestamp(seconds):
    """Render a time offset in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds before being split into
    fields, so binary floating-point error (1.001 is stored slightly below
    1.001) cannot truncate the millisecond field to the wrong value.
    """
    total_ms = int(round(float(seconds) * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}.{millis:03d}"


@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Transcribe audio/video file

    Request:
    - file: Audio/video file
    - language: Language code (default: 'fa' for Farsi)

    Response:
    - transcription results with segments and timestamps

    Errors:
    - 400 when no file is supplied or its extension is not allowed
    - 500 when the model cannot be loaded or transcription fails
    - HTTP errors raised by the framework while reading the request
      (for example an over-sized upload) keep their own status code
    """
    # Imported locally so this handler needs nothing new at module level.
    from werkzeug.exceptions import HTTPException

    filepath = None
    try:
        # Validate the request first: these checks are cheap, whereas
        # loading the model is not.
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400

        upload = request.files['file']

        # Covers both an empty and a missing filename.
        if not upload.filename:
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(upload.filename):
            return jsonify({'error': 'File type not allowed'}), 400

        # Load model if not already loaded
        whisper_model = load_model()
        if whisper_model is None:
            return jsonify({'error': 'Failed to load Whisper model'}), 500

        # Sanitised client name, used only for reporting back to the caller.
        filename = secure_filename(upload.filename)

        # Store the upload under a unique temporary name so concurrent
        # requests that use the same client filename cannot overwrite or
        # delete each other's file. The extension has already been checked
        # against the whitelist by allowed_file().
        extension = '.' + upload.filename.rsplit('.', 1)[1].lower()
        fd, filepath = tempfile.mkstemp(suffix=extension, dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        upload.save(filepath)

        # Get language code from request (default: Farsi)
        language = request.form.get('language', 'fa')

        # Transcribe
        result = whisper_model.transcribe(filepath, language=language, verbose=False)

        # Format response
        segments = [
            {
                'start': _format_timestamp(segment['start']),
                'end': _format_timestamp(segment['end']),
                'text': segment['text'].strip(),
            }
            for segment in result.get('segments', [])
        ]

        return jsonify({
            'status': 'success',
            'filename': filename,
            'language': result.get('language', 'unknown'),
            'text': result.get('text', ''),
            'segments': segments
        })

    except HTTPException:
        # Let framework errors (e.g. 413 for an over-sized body) reach the
        # client with their real status instead of being reported as 500.
        raise
    except Exception as e:
        app.logger.exception("Transcription request failed")
        return jsonify({'error': str(e)}), 500
    finally:
        # Clean up the uploaded file on every exit path, including failures.
        if filepath is not None:
            try:
                os.remove(filepath)
            except OSError as cleanup_error:
                app.logger.warning(
                    "Could not remove temporary upload %s: %s", filepath, cleanup_error
                )


@app.route('/models', methods=['GET'])
def get_models():
    """Return the Whisper model sizes this endpoint advertises and the one reported as current."""
    model_names = ['tiny', 'base', 'small', 'medium', 'large']
    payload = {
        'available_models': model_names,
        'current_model': 'medium',
        'description': 'List of available Whisper models. Larger models are more accurate but slower.',
    }
    return jsonify(payload)


@app.route('/export', methods=['POST'])
def export():
    """
    Export transcription in specified format

    Request (JSON object):
    - transcription: Full transcription text
    - segments: Array of segments with timestamps
    - format: Export format (txt, srt, vtt, json); defaults to txt

    Response:
    - Exported file content, its format and a MIME type

    Errors:
    - 400 when the body is not a JSON object, the format is unsupported,
      or the segments cannot be rendered as subtitles
    - 500 for any other failure
    """
    import json

    try:
        # silent=True returns None instead of raising for a missing or
        # malformed JSON body, so it can be reported as a client error.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Request body must be a JSON object'}), 400

        transcription = data.get('transcription', '')
        segments = data.get('segments', [])
        # str() guards against a non-string "format" value (e.g. null).
        format_type = str(data.get('format', 'txt')).lower()

        if format_type == 'txt':
            content = transcription
            mime_type = 'text/plain'
        elif format_type in ('srt', 'vtt'):
            formatter = _format_srt if format_type == 'srt' else _format_vtt
            try:
                content = formatter(segments)
            except (KeyError, TypeError):
                # Missing keys or wrongly-typed segments are a client error.
                return jsonify({
                    'error': "Each segment must be an object with 'start', 'end' and 'text' strings"
                }), 400
            mime_type = 'text/plain'
        elif format_type == 'json':
            content = json.dumps({'text': transcription, 'segments': segments}, ensure_ascii=False, indent=2)
            mime_type = 'application/json'
        else:
            return jsonify({'error': 'Unsupported format'}), 400

        return jsonify({
            'status': 'success',
            'format': format_type,
            'content': content,
            'mime_type': mime_type
        })

    except Exception as e:
        app.logger.exception("Export request failed")
        return jsonify({'error': str(e)}), 500


def _format_srt(segments):
    """Format transcription as SRT subtitle format"""
    lines = []
    for i, segment in enumerate(segments, 1):
        lines.append(str(i))
        lines.append(f"{segment['start']} --> {segment['end']}")
        lines.append(segment['text'])
        lines.append('')
    return '\n'.join(lines)


def _format_vtt(segments):
    """Format transcription as WebVTT subtitle format"""
    lines = ['WEBVTT', '']
    for segment in segments:
        lines.append(f"{segment['start']} --> {segment['end']}")
        lines.append(segment['text'])
        lines.append('')
    return '\n'.join(lines)


if __name__ == '__main__':
    # Direct execution only: start Flask's built-in server.
    # PORT selects the listening port (default 5000); debug mode is enabled
    # only when FLASK_ENV is exactly 'development'.
    listen_port = int(os.environ.get('PORT', 5000))
    debug_enabled = os.environ.get('FLASK_ENV', 'production') == 'development'
    app.run(host='0.0.0.0', port=listen_port, debug=debug_enabled, threaded=True)