@article{fahmy1995,
  author  = {Hossam A. H. Fahmy},
  title   = {Design and Implementation of {AHD-2494}, a 24-bit {RISC}
             Processor on a {VLSI} Chip},
  journal = {`looking.forward' the {IEEE} Computer Society's Student
             Newsletter},
  year    = {1995},
  note    = {(fall issue)},
  ps      = {publish/ahd2494.ps.gz},
  pdf     = {publish/ahd2494.pdf},
}

@article{fahmy1997,
  author  = {Hossam A. H. Fahmy and Khaled Ismail},
  title   = {Analysis of a Single-electron Decimal Adder},
  journal = {Applied Physics Letters},
  year    = {1997},
  month   = may,
  volume  = {70},
  pages   = {2613--2615},
  ps      = {publish/apl1997.ps.gz},
  pdf     = {publish/apl1997.pdf},
}

@inproceedings{fahmy1999_1,
  author    = {Hossam A. H. Fahmy and Martin Morf and Richard Kiehl},
  title     = {Potential Functionality of Multi-valued Tunneling Phase
               Logic Devices},
  booktitle = {European Conference on Circuit Theory and Design, Stresa,
               Italy, Session {S10-II}},
  year      = {1999},
  month     = aug,
  ps        = {publish/ecctd1999.ps.gz},
  pdf       = {publish/ecctd1999.pdf},
}

@inproceedings{hung99,
  author    = {Patrick Hung and Hossam A. H. Fahmy and Oscar Mencer and
               Michael J. Flynn},
  title     = {Fast division algorithm with a small lookup table},
  booktitle = {Thirty-Third Asilomar Conference on Signals, Systems, and
               Computers, Asilomar, California, {USA}},
  year      = {1999},
  month     = oct,
  volume    = {2},
  pages     = {1465--1468},
  ps        = {publish/asilomar1999.ps.gz},
  pdf       = {publish/asilomar1999.pdf},
}

@inproceedings{fahmy1999_2,
  author    = {Hossam A. H. Fahmy and Richard Kiehl},
  title     = {Complete Logic Family Using Tunneling-Phase-Logic Devices},
  booktitle = {The 11th International Conference on Microelectronics,
               {ICM} 99, {K}uwait},
  year      = {1999},
  month     = nov,
  ps        = {publish/icm1999.ps.gz},
  pdf       = {publish/icm1999.pdf},
}


@inproceedings{fahmy01,
  author    = {Hossam A. H. Fahmy and Albert A. Liddicoat and
               Michael J. Flynn},
  title     = {Improving the Effectiveness of Floating Point Arithmetic},
  booktitle = {Thirty-Fifth Asilomar Conference on Signals, Systems, and
               Computers, Asilomar, California, {USA}},
  year      = {2001},
  month     = nov,
  volume    = {1},
  pages     = {875--879},
  ps        = {publish/asilomar2001.ps.gz},
  pdf       = {publish/asilomar2001.pdf},
}


@inproceedings{fahmy02,
  author    = {Hossam A. H. Fahmy and Albert A. Liddicoat and
               Michael J. Flynn},
  title     = {Parametric time delay modeling for floating point units},
  booktitle = {The International Symposium on Optical Science and
               Technology, SPIE's 47th annual meeting (Arithmetic
               session), Seattle, Washington, USA},
  year      = {2002},
  month     = jul,
  ps        = {publish/spie2002.ps.gz},
  pdf       = {publish/spie2002.pdf},
}


@inproceedings{fahmy2003_1,
  author    = {Hossam Aly Hassan Fahmy and Michael J. Flynn},
  title     = {The Case For a Redundant Format in Floating Point
               Arithmetic},
  booktitle = {Proceedings of the 16th {IEEE} Symposium on Computer
               Arithmetic, Santiago de Compostela, {S}pain},
  year      = {2003},
  month     = jun,
  ps        = {publish/arith2003.ps.gz},
  pdf       = {publish/arith2003.pdf},
}

@inproceedings{fahmy2003_2,
  author    = {Hossam A. H. Fahmy and Michael J. Flynn},
  title     = {Rounding in Redundant Digit Floating Point Systems},
  booktitle = {{The International Symposium on Optical Science and
               Technology, SPIE's 48th annual meeting (Arithmetic
               session), San Diego, California, USA}},
  year      = {2003},
  month     = aug,
  ps        = {publish/spie2003.ps.gz},
  pdf       = {publish/spie2003.pdf},
}

@inproceedings{fahmy2004,
  author    = {Hossam A. H. Fahmy and Michael J. Flynn},
  title     = {An adder for a redundant digit arithmetic unit},
  booktitle = {{COOL Chips VII, Yokohama, Japan}},
  year      = {2004},
  month     = apr,
  ps        = {publish/cool2004.ps.gz},
  pdf       = {publish/cool2004.pdf},
}


@inproceedings{fahmy2005_1,
  author    = {Yajuan He and Chip-Hong Chang and Jiangmin Gu and
               Hossam A. H. Fahmy},
  title     = {A Novel Covalent Redundant Binary {Booth} Encoder},
  booktitle = {{The IEEE International Symposium on Circuits and Systems,
               (ISCAS), Kobe, Japan}},
  year      = {2005},
  month     = may,
  pages     = {69--72},
  isbn      = {0-7803-8834-8},
  ps        = {publish/iscas2005.ps.gz},
  pdf       = {publish/iscas2005.pdf},
  abstract  = {The benefit of high radix Booth encoders in reducing the
               number of partial products in fast multipliers has been
               hampered by the complexity of generating the hard
               multiples. The use of redundant binary (RB) Booth encoder
               can overcome this problem and avoid the error compensation
               vector but at the cost of doubling the number of RB
               partial products. This paper presents a novel covalent RB
               Booth encoder to generate a compound RB partial product
               from two adjacent Booth encoded digits. The new encoder
               fully exploits the characteristics of Booth encoded
               numbers to restore the effective partial product reduction
               rate of RB Booth encoder while maintaining the simplicity
               of hard multiple generators and eliminating the constant
               correction vector. A legitimate comparison on an
               $8 \times 8$-bit RB multiplier prototype shows that the
               multiplier constructed with our proposed Booth encoder
               consumes lower power and computes faster than those with
               the normal binary and redundant binary Booth encoders.},
}

@inproceedings{fahmy2005_2,
  author    = {Sherif Tawfik and Hossam A. H. Fahmy},
  title     = {Error analysis of a powering method and a novel square
               root algorithm},
  booktitle = {{The 17th IMACS World Congress Scientific Computation,
               Applied Mathematics and Simulation, Paris, France}},
  year      = {2005},
  month     = jul,
  isbn      = {2-915913-02-1},
  ps        = {publish/imacs2005.ps.gz},
  pdf       = {publish/imacs2005.pdf},
  url       = {http://sab1.sscc.ru/imacs2005/},
  abstract  = {This paper presents a complete error analysis for a novel
               square root hardware implementation. The analysis includes
               the powering method used for the initial approximation and
               the higher order Newton-Raphson square root iterations.
               Both theoretical and algorithmic error analysis are
               presented and compared. The algorithmic analysis provides
               a more accurate error estimate which reduces the size of
               the memory required in the initial approximation stage to
               less than half its original size.},
}


@inproceedings{fahmy:2006:atm,
  author    = {Sherif A. Tawfik and Hossam A. H. Fahmy},
  title     = {Algorithmic Truncation of MiniMax Polynomial Coefficients},
  booktitle = {{The IEEE International Symposium on Circuits and Systems,
               (ISCAS), Kos, Greece}},
  year      = {2006},
  month     = may,
  pages     = {2421--2424},
  isbn      = {},
  ps        = {publish/iscas2006.ps.gz},
  pdf       = {publish/iscas2006.pdf},
  abstract  = {Elementary and high-level functions can be computed in
               hardware using polynomial approximation techniques. There
               are many techniques in the literature to calculate the
               coefficients of such polynomials. Remez algorithm [1]
               provides the optimal polynomial in the Chebyshev sense
               that is minimizing the maximum error (minimax
               approximation). This paper presents an algorithm for
               truncating the coefficients of the minimax polynomials
               obtained from Remez algorithm using an algorithmic method.
               A gain of 3 and 4 bits of accuracy over the direct
               rounding is reported. Muller [2] addressed the same
               problem but his algorithm is applicable for the second
               order polynomials only. This paper presents an algorithm
               that is applicable for any order.},
}


@inproceedings{fahmy:2006:tqc,
  author    = {Hossam A. H. Fahmy},
  title     = {{Typesetting the Qur'an and its specific challenges to the
               \TeX\ family}},
  booktitle = {{Euro{\TeX}}~2006: Proceedings of the $16^{th}$ Annual
               Meeting of the European {\TeX} Users, {Debrecen},
               {Hungary}},
  year      = {2006},
  month     = jul,
  pages     = {},
  isbn      = {},
  ps        = {publish/tqsct.ps.gz},
  pdf       = {publish/tqsct.pdf},
  url       = {http://www.matexhu.org/eurotex2006/},
  abstract  = {},
}

@inproceedings{fahmy:2006:qttat,
  author    = {Hossam A. H. Fahmy},
  title     = {{AlQalam for typesetting traditional Arabic texts}},
  booktitle = {{TUG}~2006: The Annual Meeting of the International {\TeX}
               Users Group, {Marrakesh}, {Morocco}},
  year      = {2006},
  month     = nov,
  pages     = {},
  isbn      = {},
  ps        = {publish/qttat.ps.gz},
  pdf       = {publish/qttat.pdf},
  url       = {http://www.tug.org/tug2006/},
  abstract  = {},
}

@article{fahmy:2007:qttat,
  author   = {Hossam A. H. Fahmy},
  title    = {{AlQalam for typesetting traditional Arabic texts}},
  journal  = {TUGboat},
  year     = {2007},
  month    = jan,
  volume   = {27},
  number   = {2},
  pages    = {159--166},
  isbn     = {},
  ps       = {publish/tb87fahmy.ps.gz},
  pdf      = {publish/tb87fahmy.pdf},
  url      = {http://www.tug.org/TUGboat/Contents/contents27-2.html},
  note     = {This paper groups the work already presented in
              {Euro{\TeX}}~2006 and {TUG}~2006.},
  abstract = {AlQalam (``the pen'' in Arabic) is our freely available
              system intended for typesetting the Qur'an, other
              traditional texts, and any publications in the languages
              using the Arabic script. From a typographical point of
              view, the Qur'an is one of the most demanding texts.
              However, there is a long historical record of excellent
              quality materials (manuscripts and recent printings) to
              guide the work on a system to typeset it. Such a system,
              once complete, can easily typeset any work using the Arabic
              script, including those with mixed languages.},
}

@inproceedings{smitha:2006:racle,
  author    = {Smitha K. Gopi and Hossam A. H. Fahmy and Vinod A. Prasad},
  title     = {Redundant Adders Consume Less Energy},
  booktitle = {{The {IEEE} Asia-Pacific Conference on Circuits and
               Systems, (APCCAS), Singapore}},
  year      = {2006},
  month     = dec,
  pages     = {422--425},
  isbn      = {},
  ps        = {publish/racle.ps.gz},
  pdf       = {publish/racle.pdf},
  abstract  = {We conduct a complete analysis of the effect of digit
               redundancy in adders on their delay, power, energy, and
               energy-delay product. To our knowledge, this is the first
               such detailed analysis. We discuss the hybrid signed digit
               representations that offer a continuum of choices from
               two's complement representation on one extreme, all the
               way to a fully signed digit representation on the other
               extreme. Power and time delay reductions are achieved as a
               result of algorithmic level changes. Our analysis using
               TSMC 1.8~$\mu$m technology indicates that the increment in
               power over the whole range from two's complement to fully
               signed representation is relatively small (52.174\%),
               while the reduction in speed is much larger (95.455\%).
               The best designs from the energy and energy-delay product
               points of view are the most redundant. We also present a
               new Modified Hybrid Signed Digit (MHSD) adder that leads
               to greater improvements. Compared to the Hybrid Signed
               Digit (HSD) adder, MHSD adder shows power decrement of
               1.653\% and speed increment of 17.716\%.},
}


@article{ameer:2007:paf,
  author   = {Ameer M. Sherif and Hossam A. H. Fahmy},
  title    = {{Parameterized Arabic font development for AlQalam}},
  journal  = {TUGboat},
  note     = {Appeared originally in {EuroBacho{\TeX}} 2007: the
              $17^{th}$ Annual Meeting of the European {\TeX} Users,
              {Bachotek}, {Poland}},
  year     = {2008},
  month    = jan,
  volume   = {29},
  number   = {1},
  pages    = {79--88},
  isbn     = {},
  ps       = {publish/paf.ps.gz},
  pdf      = {publish/paf.pdf},
  url      = {http://www.tug.org/TUGboat/Contents/contents29-1.html},
  comment  = {http://www.gust.org.pl/gust/BachoTeX/EuroBachoTeX2007/},
  abstract = {We present new approaches to Arabic font development for
              AlQalam system. In order to achieve an output quality close
              to that of Arabic calligraphers, we try to model the pen
              nib and the way it is used to draw curves as closely as
              possible using \MF. Parameterized fonts are also introduced
              for a more flexible and dynamic combination of glyphs, to
              be used in forming ligatures and in drawing whole words as
              single entities. Quality will improve if words are created
              as single entities since the Arabic script is cursive. We
              compare our method to the basic binding of glyphs using
              simple box and glue mechanisms and also to currently
              existing font design technologies.},
}


@inproceedings{essawi:2007:cntfet,
  author    = {Amr A. Essawi and Hossam A. H. Fahmy and Nadia H. Raafat},
  title     = {{Characterization of a coaxial mid-gap SB CNTFET inverter}},
  booktitle = {{IMNC}, 20th {I}nternational {M}icroprocesses and
               {N}anotechnology {C}onference, {K}yoto, {J}apan},
  year      = {2007},
  month     = nov,
  pages     = {},
  isbn      = {},
  ps        = {publish/cntfet.ps.gz},
  pdf       = {publish/cntfet.pdf},
  abstract  = {Many research groups attempt to extend Moore's law for
               digital circuits beyond the expected end of the CMOS
               scaling by proposing alternate devices. Carbon NanoTube
               FETs, CNTFETs, are among the most promising devices. In
               this paper, we investigate the performance of digital
               inverter gates based on mid-gap Schottky Barrier CNTFET
               with coaxial structure. This structure is the most
               suitable CNT structure for future 3D integration.},
}


@article{ameer:2008:mdaf,
  author   = {Ameer M. Sherif and Hossam A. H. Fahmy},
  title    = {{Meta-designing parameterized Arabic fonts for AlQalam}},
  journal  = {TUGboat},
  note     = {Appeared originally in {TUG}~2008: The Annual Meeting of
              the International {\TeX} Users Group, {Cork}, {Ireland}},
  year     = {2008},
  month    = nov,
  volume   = {29},
  number   = {3},
  pages    = {435--443},
  isbn     = {},
  ps       = {publish/mdaf.ps.gz},
  pdf      = {publish/mdaf.pdf},
  url      = {http://www.tug.org/TUGboat/Contents/contents29-3.html},
  abstract = {In this paper we discuss how parameterized Arabic letters
              are meta-designed using \MF\ and then used to form words.
              Parameterized Arabic fonts enable greater flexibility in
              joining glyphs together and rendering words with
              imperceptible junctions and smoother letter extensions.
              This work aims to produce written Arabic with quality close
              to that of calligraphers. Words produced using our
              parameterized font are compared to other widely used fonts
              in a subjective test and results are presented.},
}


@inproceedings{fahmy:2008:corr,
  author    = {Hossam A. H. Fahmy and Ayman Elezabi},
  title     = {Bipolar sequences correlator and squarer for
               multiple-access systems},
  booktitle = {Forty-Second {A}silomar Conference on Signals, Systems,
               and Computers, {A}silomar, {C}alifornia, {USA}},
  year      = {2008},
  month     = oct,
  volume    = {},
  pages     = {1837--1839},
  ps        = {publish/corr2008.ps.gz},
  pdf       = {publish/corr2008.pdf},
  abstract  = {Significant performance gains are achieved in
               code-division multiple-access communication systems by
               including knowledge of user cross-correlations in the
               channel decoder metrics. Within these calculations, the
               sums of squares of cross-correlations between the bipolar
               spreading sequences appear frequently. In this
               contribution we present the hardware design of a
               correlator and a squarer for such bipolar sequences. Our
               focus is to explore the potential savings in the amount of
               hardware needed while maintaining a high speed design. We
               prove the mathematical concepts leading to such savings
               and describe our methodology to test the resulting designs
               for functionality. Both designs pass the tests.},
}


@inproceedings{ramy:2008:decmul,
  author    = {Ramy Raafat and Amira Mohamed and Rodina Samy and
               Tarek ElDeeb and Yasmin Farouk and Mostafa Elkhouly and
               Hossam A. H. Fahmy},
  title     = {A Decimal Fully Parallel and Pipelined Floating Point
               Multiplier},
  booktitle = {Forty-Second {A}silomar Conference on Signals, Systems,
               and Computers, {A}silomar, {C}alifornia, {USA}},
  year      = {2008},
  month     = oct,
  volume    = {},
  pages     = {},
  ps        = {publish/decmul.ps.gz},
  pdf       = {publish/decmul.pdf},
  abstract  = {Decimal floating-point multiplication is important in many
               commercial applications including financial analysis,
               banking, tax calculation, currency conversion, insurance,
               and accounting. This paper presents a fully parallel
               Decimal64 floating point multiplier compliant to IEEE Std
               754-2008 standard for floating point arithmetic. The
               proposed multiplier possesses novel methods to target low
               latency. The proposed design is based on previously
               published fixed point multiplier that uses a novel
               BCD~4221 recoding for decimal digits to improve the area
               and latency of the partial product generation and the
               partial product reduction tree. Several enhancements are
               introduced to the design; the final carry propagation
               adder is implemented using a full parallel decimal adder
               with a Kogge-Stone prefix tree, the sticky bit is
               generated parallel to the shifter to reduce the critical
               path delay. The design is extendible to support Decimal128
               floating point multiplication. The multiplier is hardware
               verified for functionality on an FPGA.},
}


@inproceedings{mostafa:2008:pow,
  author    = {Mostafa E. A. Ibrahim and Markus Rupp and
               Hossam A. H. Fahmy},
  title     = {Power Estimation Methodology for {VLIW} Digital Signal
               Processor},
  booktitle = {Forty-Second {A}silomar Conference on Signals, Systems,
               and Computers, {A}silomar, {C}alifornia, {USA}},
  year      = {2008},
  month     = oct,
  volume    = {},
  pages     = {},
  ps        = {publish/pow2008.ps.gz},
  pdf       = {publish/pow2008.pdf},
  abstract  = {In this contribution the modeling of power consumption for
               the VLIW processor TMS320C6416T is presented taking into
               account typical software algorithms in signal and image
               processing. The modeling is performed at the functional
               level making this approach distinctly different from other
               modeling approaches in low level technique. This means
               that the power consumption can be identified at an early
               stage in the design process, enabling the designer to
               explore different hardware architectures and software
               algorithms. Some typical signal and image processing
               algorithms are used for the purpose of validating the
               proposed model. The estimated power consumption is
               compared to the physically measured power consumption,
               achieving a very low resulting average estimation error of
               $1.05\%$ and a maximum estimation error of only $3.3\%$},
}


@inproceedings{amr:2008:turbo,
  author    = {Amr M. A. Hussein and Hossam A. H. Fahmy and
               Mohamed M. Khairy},
  title     = {Efficient Hardware Implementation for 802.16e double
               binary {T}urbo decoder},
  booktitle = {The International Conference on Microelectronics,
               {ICM} 2008, {U}nited {A}rab {E}mirates},
  year      = {2008},
  month     = dec,
  volume    = {},
  pages     = {},
  ps        = {publish/turbo2008.ps.gz},
  pdf       = {publish/turbo2008.pdf},
  abstract  = {In this paper, a hardware implementation of 802.16e-2005
               Turbo encoder and decoder is presented with an efficient
               interleaver implementation and normalization scheme. The
               normalization scheme is based on rescaling, which results
               in area and memory reduction and speed enhancement. It is
               shown that this normalization technique saves up to 12\%
               of the required storage in addition to saving hardware
               resources needed for the decoding operation. A speed
               efficient implementation for this normalization which
               reduces critical path delay up to 16.5\% using redundant
               number system is proposed.},
}


@inproceedings{amr:2008:drus,
  author    = {Amr M. S. Tosson and Hossam A. H. Fahmy and
               Mohamed F. Abu El-Yazeed},
  title     = {{DRUS}: A new proposed interoperable {DRM}
               hardware-software solution},
  booktitle = {The 4th International Computer Engineering Conference,
               Giza, Egypt},
  year      = {2008},
  month     = dec,
  volume    = {},
  pages     = {},
  ps        = {publish/drus2008.ps.gz},
  pdf       = {publish/drus2008.pdf},
  abstract  = {The advent of consumer digital media products has vastly
               increased the concerns of copyright-dependent
               organizations within the music and movie industries. This
               has led to the emergence of the Digital Rights Management
               (DRM) field which provides solutions to prevent
               unauthorized access to digital content in general. Many
               DRM solutions were implemented and suggested but they
               mainly suffer from interoperability issue. In this work,
               we propose a new DRM system and discuss its
               interoperability advantage over today's DRM solutions.},
}

@inproceedings{harb:2009:evi,
  author    = {Mohamed M. Harb and Hossam A. H. Fahmy},
  title     = {Deploying Electronic Vehicle Identification {(EVI)} System
               in Developing Countries},
  booktitle = {The 6th International Workshop on Intelligent
               Transportation, Hamburg, Germany},
  year      = {2009},
  month     = mar,
  volume    = {},
  pages     = {181--184},
  ps        = {publish/evi2009.ps.gz},
  pdf       = {publish/evi2009.pdf},
  abstract  = {This paper presents the constraints on deploying an
               Electronic Vehicle Identification (EVI) system and
               especially in developing countries like Egypt. We analyze
               and present a system to fulfill these requirements for two
               applications: speed limitation and electronic toll
               collection. Our system is flexible and can be easily
               modified to suit other future applications.},
}

@inproceedings{ahmedin:2009:wimax,
  author    = {Ahmed Ahmedin and S. Rashad and M. Fayez and M. A. Raouf
               and M. Sayed and Hossam A. H. Fahmy and Ahmed K. Sultan
               and M. Hamed},
  title     = {A simplification in integral frequency offset estimation
               based on joint detection algorithm for {WiMAX} 802.16e},
  booktitle = {{N}ational {R}adio {S}cience {C}onference, {C}airo,
               {E}gypt},
  year      = {2009},
  month     = mar,
  volume    = {},
  pages     = {},
  isbn      = {},
  ps        = {},
  pdf       = {},
  note      = {},
  abstract  = {Initial downlink synchronization for orthogonal frequency
               division multiple access (OFDMA) network access involves
               timing and frequency synchronization. The frequency offset
               is produced by oscillator drifts and time-varying Doppler
               shifts. In mobile WiMAX 802.16e carrier frequency offset
               (CFO) can be divided into: integral carrier frequency
               offset (ICFO) and fractional carrier frequency offset
               (FCFO). There are mainly three methods for CFO estimation:
               data-aided method, blind and semi-blind. This paper is
               based on the semi-blind method presented in ``Joint
               detection of integral carrier frequency offset and
               preamble index in OFDMA WiMAX downlink synchronization'',
               IEEE, 2007, see [2]. We simplify the algorithm presented
               in that paper by (a) using an adder-subtractor instead of
               using squares to estimate power and (b) by using a XNOR
               instead of complex multiplier; thereby reducing hardware
               gates by a factor of 676. Simulation results show only a
               slight degradation in performance with a considerable
               reduction in complexity.},
}

@inproceedings{sawaf:2009:therm,
  author    = {Mohamed A. ElSawaf and Hossam A. H. Fahmy and
               Abdel-Latif ElShafei},
  title     = {{CPU} Dynamic Thermal Management via Thermal Spare Cores},
  booktitle = {The 25th {IEEE} {S}emiconductor {T}hermal {M}easurement
               and {M}anagement {S}ymposium, {S}an {J}ose, {CA}, {USA}},
  year      = {2009},
  month     = mar,
  volume    = {},
  pages     = {139--145},
  isbn      = {978-1-4244-3664-4},
  ps        = {publish/therm2009.ps.gz},
  pdf       = {publish/therm2009.pdf},
  note      = {DOI: 10.1109/STHERM.2009.4810755},
  abstract  = {Adding cores to CPU chip increases its power density and
               leads to thermal throttling due to cooling limitations.
               Thermal spare cores (TSC) is proposed as new technique for
               dynamic thermal management (DTM). Our objective is to
               avoid thermal throttling and ensure stable CPU
               performance. Towards this objective, thermal model of IBM
               Power 4 CPU chip contains 8 cores implemented as proof of
               concept. TSC higher potential expected with CPU chip
               having higher number of cores under thermal constraints.
               In the near future we will be able to add dozens of cores
               to CPU chip; while we will not be able to activate them
               all simultaneously due to air cooling limitations and
               thermal throttling.},
}

@inproceedings{fahmy:2009:energyDFP,
  author    = {Hossam A. H. Fahmy and Ramy Raafat and
               Amira M. Abdel-Majeed and Rodina Samy and Tarek ElDeeb and
               Yasmin Farouk},
  title     = {Energy and Delay Improvement via Decimal Floating Point
               Units},
  booktitle = {Proceedings of the 19th {IEEE} Symposium on Computer
               Arithmetic, Portland, Oregon, {USA}},
  year      = {2009},
  month     = jun,
  pages     = {221--224},
  ps        = {publish/arith2009.ps.gz},
  pdf       = {publish/arith2009.pdf},
  abstract  = {Interest in decimal arithmetic increased considerably in
               recent years. This paper presents new designs for decimal
               floating point (DFP) addition, multiplication, fused
               multiply-add, division, and square root. It stresses the
               importance of energy savings achieved by hardware
               implementations of the IEEE standard for decimal floating
               point. To the best of the authors knowledge, this is the
               first work to discuss energy savings in DFP and the first
               to present a hardware implementation of a fused
               multiply-add. Our Newton-Raphson based divider is over
               three times faster than the similar design previously
               reported.},
}


@inproceedings{sallab:2009:DSR,
  author    = {Ahmad A. Al-Sallab and Hossam A. H. Fahmy and
               Mohsen Rashwan},
  title     = {Hardware Implementation of Distributed Speech Recognition
               System Front End},
  booktitle = {{EUSIPCO}, 17th {E}uropean {S}ignal {P}rocessing
               {C}onference, {G}lasgow, {S}cotland},
  year      = {2009},
  month     = aug,
  pages     = {953--957},
  isbn      = {},
  ps        = {publish/dsr.ps.gz},
  pdf       = {publish/dsr.pdf},
  abstract  = {Modern speech recognition applications are heading towards
               embedded systems and hand-held devices. Distributed Speech
               Recognition (DSR) system architecture emerged to address
               this kind of applications. Most of the existing
               implementations of this system are presented in software
               fashion, with little consideration to the end product
               platform in which the system will be deployed. In this
               paper, an optimized hardware implementation of the front
               end part of the DSR specified in the basic ETSI Aurora
               standard ETSI ES 201 108 is presented in FPGA platform
               prototype, with consideration of migration to structured
               ASIC in case of mass-production. Main design issues and
               tips are highlighted. Results are presented in terms of
               hardware resources utilization, comparison of some basic
               system components to third party reference designs and
               compliance to the Aurora standard.},
}


@inproceedings{sallab:2009:fft,
  author    = {Ahmad A. Al-Sallab and Hossam A. H. Fahmy and
               Mohsen Rashwan},
  title     = {Optimized hardware implementation of {FFT} processor},
  booktitle = {The 4th {I}nternational {D}esign and {T}est Workshop
               ({IDT}), {R}iyadh, {S}audi {A}rabia},
  year      = {2009},
  month     = nov,
  pages     = {},
  isbn      = {978-1-4244-5748-9},
  ps        = {},
  pdf       = {},
  note      = {DOI: 10.1109/IDT.2009.5404139},
  abstract  = {Fast Fourier transform (FFT) is an essential component in
               many digital signal processing and communications systems.
               The performance of the FFT component is a key factor in
               evaluating the overall system performance, and it is
               common to use it as a benchmark for the whole system. Many
               attempts have been made to enhance the FFT performance,
               both on algorithm and implementation levels. Software and
               hardware designs exist to implement this component. In
               this paper, an optimized hardware implementation of FFT
               processor on FPGA is presented, where the steps of radix-2
               FFT algorithm are well analyzed and an optimized design is
               developed as a result, with full exploitation of the
               hardware platform capabilities to achieve optimum
               performance. The performance results of the proposed
               design are demonstrated, and compared to other related
               works and reference designs.},
}


@inproceedings{wazeer:2009:chest,
  author    = {Khaled ElWazeer and Mohamed M. Khairy and
               Hossam A. H. Fahmy and Serag E.-D. Habib},
  title     = {{FPGA} Implementation of an Improved Channel Estimation
               Algorithm for Mobile {WiMAX}},
  booktitle = {The International Conference on Microelectronics,
               {ICM} 2009, {M}orocco},
  year      = {2009},
  month     = dec,
  ps        = {publish/icm2009.ps.gz},
  pdf       = {publish/icm2009.pdf},
  abstract  = {In this paper, we present an FPGA implementation of an
               improved channel estimation technique for the OFDM based
               WiMAX systems. The proposed technique is a modified
               version of the well-known MMSE technique. The
               implementation minimizes the complexity of the accurate
               MMSE based channel estimator by making use of sets of
               previously calculated filter coefficients in estimation.
               The hardware architecture presented has a major advantage
               which is its ability to adapt itself to cope with
               different wireless standards not only WiMAX.},
}


@inproceedings{mostafa:2009:trans,
  author    = {Mostafa E. A. Ibrahim and Markus Rupp and
               Hossam A. H. Fahmy},
  title     = {Code Transformations and {SIMD} Impact on Embedded
               Software Energy/Power Consumption},
  booktitle = {{ICCES09}, International Conference on Computer
               Engineering and Systems, {C}airo, {E}gypt},
  year      = {2009},
  month     = dec,
  volume    = {},
  pages     = {},
  ps        = {publish/trans2009.ps.gz},
  pdf       = {publish/trans2009.pdf},
  abstract  = {The increasing demand for portable computing has elevated
               power consumption to be one of the most critical embedded
               systems design parameters. In this paper, we present a
               qualitative study wherein we examine the impact of code
               transformations on the energy and power consumption. Three
               main categories of code transformations are investigated,
               namely data, loop and procedural oriented transformations.
               Moreover, we evaluate the influence of employing Single
               Instruction Multiple Data (SIMD) on energy and power
               dissipation via the utilization of compiler intrinsic
               C-functions. Results show that a trade-off between power
               and performance can be achieved by employing the intrinsic
               C-functions in conjunction with some transformations such
               as loop unrolling and procedure integration.},
}


@inproceedings{rodina:2010:decfma,
  author    = {Rodina Samy and Hossam A. H. Fahmy and Ramy Raafat and
               Amira Mohamed and Tarek ElDeeb and Yasmin Farouk},
  title     = {A Decimal Floating-Point Fused-Multiply-Add Unit},
  booktitle = {Fifty-Third {M}id{W}est {S}ymposium on {C}ircuits and
               {S}ystems, {(MWSCAS)}, {S}eattle, {W}ashington, {USA}},
  year      = {2010},
  month     = aug,
  volume    = {},
  pages     = {},
  ps        = {publish/decfma.ps.gz},
  pdf       = {publish/decfma.pdf},
  abstract  = {This paper presents the first hardware implementation of a
               fully parallel decimal floating-point fused-multiply-add
               unit performing the operation $\pm (A \times B) \pm C$ on
               decimal floating-point operands. The proposed design is
               fully compliant with the IEEE 754-2008 standard and
               supports the two standard formats decimal64 and
               decimal128. Furthermore, the proposed design may be
               controlled to perform the multiplication or the
               addition/subtraction as standalone operations. Our decimal
               floating-point FMA may be pipelined so that a complete
               resultant decimal floating-point is available each clock
               cycle.},
}


@inproceedings{hamed:2010:mcrsd,
  author    = {Hamed Salah and Hazem Ahmed and Tallal ElShabrawy and
               Hossam A. H. Fahmy},
  title     = {Low-Energy Configurable Syndrome/{Chien} Search
               Multi-Channel {Reed} {Solomon} Decoder},
  booktitle = {The 23rd {IEEE} International System On Chip Conference,
               {L}as {V}egas, {N}evada, USA},
  year      = {2010},
  month     = sep,
  volume    = {},
  pages     = {},
  ps        = {},
  pdf       = {publish/mcrsd_socc2010.pdf},
  abstract  = {},
}


@inproceedings{hazem:2010:ilBM,
  author    = {Hazem A. Ahmed and Hamed Salah and Tallal ElShabrawy and
               Hossam A. H. Fahmy},
  title     = {A Low Energy High Speed {R}eed-{S}olomon Decoder Using
               Decomposed Inversionless {B}erlekamp-{M}assey Algorithm},
  booktitle = {Forty-Fourth {A}silomar Conference on Signals, Systems,
               and Computers, {A}silomar, {C}alifornia, {USA}},
  year      = {2010},
  month     = nov,
  volume    = {},
  pages     = {},
  ps        = {},
  pdf       = {},
  abstract  = {},
}

@inproceedings{Fattah:2010:EVD,
  author    = {Amr Sayed-Ahmed and Hossam A. H. Fahmy and Mahmoud Hassan},
  title     = {Three Engines to Solve Verification Constraints of Decimal
               Floating-Point Operations},
  booktitle = {Forty-Fourth {A}silomar Conference on Signals, Systems,
               and Computers, {A}silomar, {C}alifornia, {USA}},
  year      = {2010},
  month     = nov,
  volume    = {},
  pages     = {},
  ps        = {},
  pdf       = {},
  abstract  = {},
}

@InProceedings(karim:2010:RDadd,
  author    = "Karim Yehia and Hossam A. H. Fahmy and Mahmoud Hassan",
  title     = "A Redundant Decimal Floating-Point Adder",
  booktitle = "Forty-Fourth {A}silomar Conference on Signals, Systems, and
          Computers, {A}silomar, {C}alifornia, {USA}",
  year      = "2010",
  month     = nov,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "",
  abstract  = "",
)

@InProceedings(mahmoud:2010:pow,
  author    = "Mahmoud Hassan and Tarek ElDeeb and Hossam A. H. Fahmy",
  title     = "Algorithm and Architecture for On-Line Decimal Powering Computation",
  booktitle = "Forty-Fourth {A}silomar Conference on Signals, Systems, and
          Computers, {A}silomar, {C}alifornia, {USA}",
  year      = "2010",
  month     = nov,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "",
  abstract  = "",
)

@InProceedings(fahmy:2010:DFPprocessors,
  author    = "Hossam A. H. Fahmy and Tarek ElDeeb and Mahmoud Hassan and 
Yasmin Farouk and Ramy Eissa",
  title     = "Decimal Floating Point For Future Processors",
  booktitle = "The 22nd International Conference on Microelectronics,
            {ICM} 2010, {E}gypt",
  year      = "2010",
  month     = dec,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "",
  abstract  = "",
)


@article(mostafa:2011:model,
  author   = "Mostafa E. A. Ibrahim and Markus Rupp and Hossam A. H. Fahmy",
  title    = "{A Precise High-Level Power Consumption Model for Embedded Systems Software}",
  journal  = "{EURASIP} Journal on Embedded Systems",
  year     = "2011",
  month    = "",
  volume   = "2011",
  number   = "",
  pages    = "",
  isbn     = "",
  ps       = "",
  pdf      = "",
  url      = "http://jes.eurasipjournals.com/content/2011/1/480805",
  note     = "Article ID 480805, doi:10.1155/2011/480805",
  abstract = "The increasing demand for portable computing has elevated 
power consumption to be one of the most critical embedded systems 
design parameters. In this paper, we present a precise high-level 
power estimation methodology for the software loaded on a VLIW 
processor that is based on a functional level power model. The 
targeted processor of our approach is the TMS320C6416T DSP from Texas 
Instrument. We consider several important issues in our model such as 
the pipeline
stall, inter-instructions effect and cache misses. The contributions 
are the following. First, a precise model to estimate the power 
consumption
of the targeted DSP, while running a software algorithm is proposed. 
Second, we prove the validation and precision of our
model on many typical algorithms applied in signal and image 
processing. Third, we further validate the precision of our model
on a real application applied in the video processing field. The power 
consumption estimated by our model is compared to the
physically measured power consumption, achieving a very low average 
absolute estimation error of 1.65\% and a maximum
absolute estimation error of only 3.3\%. ",
)


@InProceedings(Fattah:2011:VDFMA,
  author    = "Amr Sayed-Ahmed and Hossam A. H. Fahmy and Rodina Samy",
  title     = "Verification  of Decimal Floating-Point Fused-Multiply-Add Operation",
  booktitle = "The Ninth {ACS/IEEE} International Conference on Computer 
Systems and Applications, ({AICCSA}), Sharm El-Sheikh, Egypt",
  year      = "2011",
  month     = dec,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "publish/aiccsa2011.pdf",
  abstract  = "Decimal floating-point fused-multiply-add (FMA)
software or hardware designs require a verification process to
prove that the design is in compliance with the IEEE Standard for
Floating-Point Arithmetic (IEEE Std 754-2008). Our work
represents the first verification technique to verify the decimal
FMA designs using simulation based coverage models. The paper
describes in details the coverage models needed in the verification
of the decimal FMA, the FMA engine used to solve the coverage
models, and the results of using that technique in the verification of
SilMinds FMA hardware design, DecNumber FMA software
design, and Intel-Decimal-Library FMA software design. The
Technique has proven its efficiency in discovering bugs in FMA
software and hardware designs.",
)


@InProceedings(mahmoud:2011:asip,
  author    = "Mahmoud Abdelall and Ahmed F. Shalash and Hossam A. H. Fahmy",
  title     = "A reconfigurable baseband processor for wireless {OFDM} 
          synchronization sub-system",
  booktitle = "{The IEEE International Symposium on Circuits and Systems, 
(ISCAS), Rio de~Janeiro, Brazil}",
  year      = "2011",
  month     = may,
  pages     = "2385--2388",
  isbn      = "978-1-4244-9473-6",
  ps        = "",
  pdf       = "",
  note      = "DOI: 10.1109/ISCAS.2011.5938083",
  abstract  = "
In this paper, an Application Specific Instruction-set Processor (ASIP) 
architecture to perform all OFDM synchronization tasks is proposed. 
While applicable to many OFDM systems, the proposed architecture is 
tested on Long Term Evolution (LTE Rel. 8) and WiMAX 802.16e systems. 
The synchronization tasks include, but not limited to symbol timing, 
fine carrier frequency offset (CFO) estimation, coarse CFO estimation, 
cell search, residual CFO estimation and sampling clock frequency offset 
estimation. The engine is scalable and runs at 120~MHz with a total gate 
count of 118k and control overhead less than 10\% of total processing 
cycles. The results of software simulations as well as the results of 
verilog synthesis are presented.
",
)

@InProceedings(eid:2011:memTurbo,
  author    = "Eid M. Abdel-Hamid and Hossam A. H. Fahmy and Mohamed M. 
Khairy and Ahmed F. Shalash",
  title     = "Memory Conflict Analysis For A Multi-standard,
       Reconfigurable Turbo Decoder",
  booktitle = "{The IEEE International Symposium on Circuits and Systems, 
(ISCAS), Rio de~Janeiro, Brazil}",
  year      = "2011",
  month     = may,
  pages     = "2701--2704",
  isbn      = "978-1-4244-9473-6",
  ps        = "publish/iscas2011.ps.gz",
  pdf       = "publish/iscas2011.pdf",
  note      = "DOI: 10.1109/ISCAS.2011.5938162",
  abstract  = "This paper presents an efficient architecture to 
handle memory conflicts of a unified turbo decoder which supports
multiple standards like HSPA+, 3GPP-LTE, WiMAX, 3GPP2-CDMA2000 
and CCSDS. A unified radix-4 turbo decoder is used
as the reconfigurable unit which provides double throughput
for some standards. A complete memory conflict analysis for
different interleaver patterns has been performed and shows the
effect of using radix-4 decoding on the memory conflicts for
different standards. Such a conflict adds latency and reduces
the throughput significantly. A simple controller is designed to
manage the conflicts on the fly. The proposed design has a
maximum throughput of 283.104Mbps.",
)

@InProceedings(mervat:2011:bindecmul,
  author    = "Mervat Mahmoud and Hossam A. H. Fahmy",
  title     = "A Parallel Combined Binary/Decimal Fixed-Point Multiplier
        with Binary Partial Products Reduction Tree",
  booktitle = "{The 21st International Conference on Computer Theory and Applications (ICCTA), Alexandria, Egypt}",
  year      = "2011",
  month     = oct,
  pages     = "",
  isbn      = "",
  ps        = "publish/bindecmul2011.ps.gz",
  pdf       = "publish/bindecmul2011.pdf",
  abstract  = "Combined binary/decimal arithmetic has
become an important topic to support decimal and binary
applications with high speed and low area. This paper
presents a combined binary/decimal fixed-point multiplier
design. Since the partial products accumulation stage has the
largest area and delay of the multiplier, it is the most
significant stage. A novel binary column tree is shared for
binary and decimal reduction tree. A comparison between
the proposed design and the previously published designs
shows a significant decrease in area with almost the same
delay as the fastest known design.",
)

@InProceedings(mohamed:2011:lza,
  author    = "Mohamed Hosny Amin and Ahmed Mohamed ElTantawy and Hossam A. H. Fahmy and Alhassan Khedr",
  title     = "Efficient Decimal Leading Zero Anticipator Designs",
  booktitle = "Forty-Fifth {A}silomar Conference on Signals, Systems, and
          Computers, {A}silomar, {C}alifornia, {USA}",
  year      = "2011",
  month     = nov,
  pages     = "",
  isbn      = "",
  ps        = "publish/lza2011.ps.gz",
  pdf       = "publish/lza2011.pdf",
  abstract  = "",
)


@InProceedings(amir:2012:noc,
author = "Amir H. M. Zaytoun and Hossam A. H. Fahmy and Khaled M. F. Elsayed",
title  = "Implementation and Evaluation of Large Interconnection Routers 
          for Future Many-Core Networks on Chip",
bookTitle= "The 14th {IEEE} International Conference on High Performance 
            Computing and Communications, {L}iverpool, {UK}",
year   = "2012",
month  = jun,
pages   = "",
ISBN   = "",
ps     = "publish/noc2012.ps.gz",
pdf    = "publish/noc2012.pdf",
abstract = "As the number of processing elements in the future
Networks on Chip (NoC) increases from multi-cores to many-cores, 
the role of the interconnection communications becomes
more critical. The number of cores on a System on Chip (SoC)
will reach thousands in the near future as predicted by the
International Technology Roadmap for Semiconductors
(ITRS). Currently, NoC interconnections are mostly
implemented with $m \times n$ 2-D mesh topology connecting small
size routers. This will represent the bottleneck to the
communication latency for the increasing number of cores
where the average number of hops the data have to pass will
increase. In this paper, we propose an alternative NoC
interconnecting scheme by using large routers interconnecting
large number of cores in star topology. This interconnection
scheme can be scaled up by using hierarchical-star or fat-tree
topologies. We present the implementation and performance
evaluation of three large router architectures and compare
their efficiency to the small $5 \times 5$ router used in the mesh
topology. We develop a simulating environment that resembles
the real NoC conditions to test the routers throughput and
average latency on different buffer sizes and under different
traffic loads. We also synthesize them to estimate the area and
power consumption. Then, routers efficiencies are calculated
with respect to the area and power consumption.
",
)

@InProceedings(walid:2012:mrs,
author = "Walid El-Reedy and Ali A. El-Moursy and Hossam A. H. Fahmy",
title  = "High Performance Memory Requests Scheduling Technique for 
          Multicore Processors",
bookTitle= "The 14th {IEEE} International Conference on High Performance 
            Computing and Communications, {L}iverpool, {UK}",
year   = "2012",
month  = jun,
pages   = "",
ISBN   = "",
ps     = "publish/mem_schd.ps.gz",
pdf    = "publish/mem_schd.pdf",
abstract = "   In modern computer systems, long memory latency 
is one of the main bottlenecks micro-architects are facing
for leveraging the system performance especially for memory-intensive 
applications. This emphasises the importance of
the memory access scheduling to efficiently utilize memory
bandwidth. Moreover, in recent micro-processors, multithread
and multicore is turned to be the default choice for their
design. This resulted in more contention on memory. Hence,
the effect of memory access scheduling schemes is more
critical to the overall performance boost. Although memory
access scheduling techniques have been recently proposed for
performance improvement, most of them have overlooked the
fairness among the running applications. Achieving both 
high-throughput and fairness simultaneously is challenging.
   In this paper, we focus on the basic idea of memory
requests scheduling, which includes how to assign priorities
to threads, what request should be served first, and how to
achieve fairness among the running applications for multi-core 
microprocessors. We propose two new memory access
scheduling techniques FLRMR, and FIQMR. Compared to
recently proposed techniques, on average, FLRMR achieves
8.64\% speedup relative to LREQ algorithm, and FIQMR
achieves 11.34\% speedup relative to IQ-based algorithm.
FLRMR outperforms the best of the other techniques by 8.1\%
in 8-cores workloads. Moreover, FLRMR improves fairness
over LREQ by 77.2\% on average.
",
)

@InProceedings(shehab:2012:eccdb,
  author    = "Shehab Y. Elsayed and Hossam A.  H. Fahmy and Muhammad S. Khairy",
  title     = "Residue Codes for Error Correction in a Combined Decimal/Binary Redundant Floating Point Adder",
  booktitle = "Forty-Sixth {A}silomar Conference on Signals, Systems, and
          Computers, {A}silomar, {C}alifornia, {USA}",
  year      = "2012",
  month     = nov,
  pages     = "",
  isbn      = "",
  ps        = "",
  pdf       = "",
  abstract  = "As fault rates increase when technology advances from one node to another, fault tolerance becomes vital for the reliability of
arithmetic circuits. This work represents an attempt to achieve fault tolerance for a combined IEEE decimal-64/binary-64 floating
point redundant adder by using residue codes. To our knowledge, this is the first implementation of a residue error correction
scheme in decimal and binary arithmetic circuits. The proposed circuit has the ability of all-digit error correction assuming that
errors occur only in the main adder.",
)

@InProceedings(ahmed:2012:lcap,
  author    = "Ahmed Hazem and Hossam A. H. Fahmy",
  title     = "{LCAP} - {A} {L}ightweight {CAN} {A}uthentication {P}rotocol for Securing In-Vehicle Networks",
  booktitle = "10th escar Embedded Security in Cars Conference, {B}erlin, {G}ermany",
  year      = "2012",
  month     = nov,
  pages     = "",
  isbn      = "",
  ps        = "",
  pdf       = "",
  abstract  = "",
)

@InProceedings(hazem:2012:eviBM,
  author    = "Hazem A. Ahmed and Hamed Salah and Tallal Elshabrawy and Hossam A. H. Fahmy",
  title     = "Low Energy High Speed {R}eed-{S}olomon Decoder Using Two parallel 
          Modified Evaluator Inversionless {B}erlekamp-{M}assey",
  booktitle = "19th {IEEE} International Conference on Electronics, Circuits, and Systems, {S}eville, {S}pain",
  year      = "2012",
  month     = dec,
  pages     = "",
  isbn      = "",
  ps        = "",
  pdf       = "publish/BM_2p_mod_ev.pdf",
  abstract  = "This paper proposes a low power high throughput Reed
Solomon decoder designed optimally for handheld devices under
the DVB-H standard. This architecture based on Decomposed
Inversionless Berlekamp-Massey Algorithm (DiBM), where the
error locator and evaluator polynomial can be computed serially.
In the proposed architecture, a new scheduling of 6~Finite
Field Multipliers (FFMs) is used to calculate the error locator
polynomial in a two parallel way and these multipliers are
reused to calculate the error evaluator polynomial in a novel
architecture called two parallel modified evaluator decomposed
inversionless Berlekamp-Massey (MEDiBM) to achieve low energy.
This architecture is tested in a pipelined two parallel decoder. This
decoder has been implemented by $0.13 \mu$m CMOS IBM standard
cells for RS(204, 188) and gave gate count of 33 K and area
of 1.06~mm$^2$ . Simulation results show this approach can work
successfully at the data rate 100~Mbps with power dissipation of
0.266~mW.",
)

@article(sherif:2012:experience,
  author   = "Sherif S. Mansour and Hossam A. H. Fahmy",
  title    = "Experiences with {A}rabic font development",
  journal  = "TUGboat",
  year     = "2012",
  month    = dec,
  volume   = "33",
  number   = "3",
  pages    = "295--299",
  isbn     = "",
  ps       = "",
  pdf      = "",
  url      = "",
  note     = "",
  abstract = "
This is a report of our experiences attempting to use a new font, AlQalam,
for the Arabic script 
within \TeX. Then we want to make use of the new features introduced in Lua\TeX\ to build our context analysis and line breaking 
engines to achieve a complete functional font package.
We describe the challenges of producing high-quality Arabic fonts in
general and what AlQalam has introduced 
to meet Arabic script requirements.
We also describe the problems we faced trying to figure out how to use a new right-to-left font within \TeX , 
what approaches we used to debug the font and some debugging results.
This remains work in progress.
",
)


@article(zidan:2013:memristor,
  author   = "Mohammed Affan Zidan and Hossam Aly Hassan Fahmy and Muhammad Mustafa Hussain and Khaled Nabil Salama",
  title    = "Memristor-based Memory: The Sneak Paths Problem and Solutions",
  journal  = "Microelectronics Journal",
  year     = "2013",
  month    = feb,
  volume   = "44",
  number   = "2",
  pages    = "176--183",
  issn     = "0026-2692",
  ps       = "",
  pdf      = "",
  mynote   = "Received 13 March 2012, 
    Revised 30 September 2012,
    Accepted 2 October 2012,
    Available online 26 October 2012",
  note     = "DOI: 10.1016/j.mejo.2012.10.001",
  url      = "http://dx.doi.org/10.1016/j.mejo.2012.10.001",
  abstract = "In this paper, we investigate the read operation of memristor-based memories. We analyze the sneak paths problem and provide a
noise margin metric to compare the various solutions proposed in the literature. We also analyze the power consumption associated
with these solutions. Moreover, we study the effect of the aspect ratio of the memory array on the sneak paths. Finally, we introduce
a new technique for solving the sneak paths problem by gating the memory cell using a three-terminal memistor device.
",
)

@InProceedings(mtarek:2013:bch,
  author    = "Mohamed T. A. Osman and Hossam A. H. Fahmy and Yasmine A. H. Fahmy and Maha A. Elsabrouty",
  title     = "Two Programmable {BCH} Soft Decoders for High
     Rate Codes with Large Word Length",
  booktitle = "{The IEEE International Symposium on Circuits and Systems, 
(ISCAS), Beijing, China}",
  year      = "2013",
  month     = may,
  pages     = "",
  isbn      = "",
  ps        = "",
  pdf       = "",
  abstract  = "In this paper, two BCH soft decoders are proposed
suitable for high rate codes with medium to large word length.
The proposed decoders provide a programmable performance
gain, with a reduced critical path allowing for an increase upto
m/2 times the operating frequency of algebraic decoders, where m
is the Galois field size. Our proposed decoders operate only on
the least reliable bits, which leads to a reduction in the decoder
complexity by removing the Chien search procedure.
",
)


@InProceedings(manar:2013:dvb,
  author    = "Manar N. H. Shaker and Hisham M. Hamed and Ahmed F. Shalash and Hossam A. H. Fahmy",
  title     = "Efficient Implementation Of Time De-Interleaver For {DVB-T2}",
  booktitle = "{ International Conference on Communication, 
            Control and Computer Engineering, (ICCCCE), Istanbul, Turkey}",
  year      = "2013",
  month     = dec,
  pages     = "",
  isbn      = "",
  ps        = "",
  pdf       = "",
  abstract  = "In this paper, we propose a memory efficient implementation
of Time De-Interleaver (TDI) for Digital Video Broadcasting-Terrestrial
2nd Generation (DVB-T2). The presented implementation supports all
the modulation types and it is valid for all possible data rates as defined
by the standard. Our proposal satisfies the maximum number of cells
mentioned in the standard with efficient memory. For multiple physical
layer pipes (PLP) our design has less than the minimum excepted memory
size in the standard and for single PLP it has exactly the minimum
excepted memory size. Our design is a viable candidate for the future
T2-Lite standard.
",
)


@article(osman:2014:bch,
  author   = "Mohamed Tarek A.~Osman and Hossam A.~H.~Fahmy 
          and Yasmine A.~H.~Fahmy and Maha M.~Elsabrouty and Ahmed Shalash",
  title    = "Two Extended Programmable {BCH} Soft Decoders Using Least 
Reliable Bits Reprocessing",
  journal  = "Circuits, Systems and Signal Processing by Springer",
  year     = "2014",
  month    = may,
  volume   = "33",
  number   = "5",
  pages    = "1369--1391",
  issn     = "0278-081X",
  ps       = "",
  pdf      = "",
  note     = "DOI: 10.1007/s00034-013-9709-x",
  url      = "http://dx.doi.org/10.1007/s00034-013-9709-x",
  abstract = "
This paper proposes two BCH soft decoders suitable for high rate
codes with medium to large word length. The proposed decoders extend the
correcting capability by providing a programmable performance gain 
according to the choice of the extra compensated bits p, with a theoretical 
maximum likelihood decoding when 2t + p approaches the codeword size n, 
where t is the
correcting capability of the code under algebraic decoding. Our proposed 
architectures for the proposed algorithms use pipelined arithmetic units, 
leading
to a reduction in the critical paths. 
This allows for an increase in the operating frequency up to m/2 times 
compared to algebraic decoders, where m is the Galois field size. 
Our proposed decoders operate only on the least reliable bits,
which leads to a reduction in the decoder complexity by removing the Chien
search procedure.
",
)


@article(zidan:2014:closedsneak,
  author       = " Mohammed Affan Zidan and Ahmed M. Eltawil and Fadi Kurdahi 
           and Hossam A.~H.~Fahmy and Khaled Nabil Salama",
  title        = "Memristor Multi-Port Readout: A Closed-Form Solution for Sneak-Paths",
  journal      = "{IEEE} {T}ransactions on {N}anotechnology ({TNANO})",
  personalnote = "{IEEE} {T}ransactions on {N}anotechnology ({TNANO}): Regular Papers",
  year         = "2014",
  month        = mar,
  volume       = "13",
  number       = "2",
  pages        = "274--282",
  issn         = "1536-125X",
  ps           = "",
  pdf          = "",
  note         = "DOI: 10.1109/TNANO.2014.2299558",
  url          = "http://dx.doi.org/10.1109/TNANO.2014.2299558",
  abstract     = "
In this paper, we introduce for the first time, a
closed-form solution for the memristor-based memory sneak-paths 
without using any gating elements. The introduced technique 
fully eliminates the effect of sneak-paths, by reading
the stored data using multiple access points and evaluating a
simple addition/subtraction on the different readings. The new
method requires fewer reading steps compared to previously
reported techniques, and has a very small impact on the memory
density. To verify the underlying theory, the proposed system is
simulated using Synopsys HSPICE showing the ability to achieve
a 100\% sneak-path error free memory. In addition, the effect of
quantization bits on the system performance is studied.
",
)


@article(mounir:2014:aging,
author = "Mohamed Mounir Mahmoud and Norhayati Soin and Hossam A.~H.~Fahmy",
title  = "Design Framework to Overcome Aging Degradation of the 16 nm {VLSI} Technology Circuits",
journal= "{IEEE} {T}ransactions on {C}omputer {A}ided {D}esign of {I}ntegrated {C}ircuits and {S}ystems",
year   = "2014",
month  = may,
volume = "33",
number = "5",
pages  = "691--703",
ISSN   = "0278-0070",
ps     = "",
pdf    = "",
note   = "DOI: 10.1109/TCAD.2014.2299713",
url    = "http://dx.doi.org/10.1109/TCAD.2014.2299713",
abstract ="Intensive scaling for VLSI circuits is a key factor for
gaining outstanding performance. However, this scaling has huge
negative impact on circuit reliability, as it increases the undesired
effect of aging degradation on ultra-deep submicron technologies.
Nowadays, Bias Temperature Instability (BTI) aging process has
a major negative impact on VLSI circuits reliability. This paper
presents a comprehensive framework that assists in designing
fortified VLSI circuits against BTI aging degradation. The
framework contains: (1) Novel circuit level techniques that
eliminate the effect of BTI, these techniques successfully decrease
the power dissipation by 36\% and enhance the reliability of
VLSI circuits, (2) Evaluation of the reliability of all circuit level
techniques used to eliminate BTI aging degradation for 16~nm
CMOS technology, (3) Comparison between the efficiency of all
circuit level techniques in terms of power consumption and area.
",
)


@InProceedings(amr:2014:dsqrt,
  author    = "Amr Sayed Ahmed and Hossam A. H. Fahmy and Ulrich Kuehne",
  title     = "Verification of the Decimal Floating-Point Square Root Operation",
  booktitle = "19th {IEEE} {E}uropean {T}est {S}ymposium ({ETS}), {P}aderborn, {G}ermany",
  year      = "2014",
  month     = may,
  pages     = "",
  isbn      = "",
  ps        = "",
  pdf       = "",
  abstract  = "Decimal floating-point is a relatively recent addition
to the IEEE standard (IEEE Std 754-2008). There exist few
verification techniques that can check whether software libraries
or hardware designs are in compliance with the standard. Our
work presents a verification method to verify implementations
of the decimal floating-point square root operation. We present
an effective simulation based verification technique using test
cases that verify the corner cases of the operation. The test
cases are generated by solving constraints describing these corner
cases with a dedicated constraint solver. The generated test cases
proved their usefulness by finding severe bugs in two well-tested
designs.",
)


@article(moursy:2014:memsch,
  author   = "Ali Ali El-Moursy and Walid El-Reedy and Hossam A.~H.~Fahmy",
  title    = "Fair Memory Access Scheduling Algorithms for Multicore Processors",
  journal  = "International Journal of Parallel, Emergent and Distributed Systems",
  year     = "2014",
  month    = "",
  volume   = "",
  number   = "",
  pages    = "",
  issn     = "",
  ps       = "",
  pdf      = "",
  note     = "DOI: 10.1080/17445760.2014.922560",
  url      = "http://dx.doi.org/10.1080/17445760.2014.922560",
  abstract = "
    In recent processors multicore and multithreaded architectures are 
widely used. This increases the number of contexts running in parallel. 
Contexts are competing for shared resources of the processor. One of 
these critical resources is system main memory. While execution, each 
context sends requests to the main memory to serve the cache misses. 
This introduces the need of memory access schedulers. In this paper, we 
propose two main ideas to be integrated with different existing 
algorithms. These two ideas are: related requests, and starvation time 
threshold for memory requests. We integrated these ideas to three 
existing algorithms to come up with the following new algorithms: The 
Fair Least-Request Most Related (FLRMR) algorithm, the Fair Issue-Queue 
based Most Related (FIQMR) algorithm, and the Fair Ready-to-Inflight 
Ratio Most Related (FRIRMR) based on the Least Request (LREQ) algorithm, 
the Issue Queue based (IQ-based) algorithm, and the Ready-to-Inflight 
Ratio (RIR) algorithm respectively. The results show that FLRMR, FIQMR, 
and FRIRMR improve fairness over LREQ, IQ-based, and RIR algorithms by 
54.6\%, 80.6\%, and 68\% on average respectively. Moreover, compared to 
recently proposed techniques, on average, FLRMR achieves 9.1\% speedup 
relative to LREQ algorithm, FIQMR achieves 17.6\% speedup relative to 
IQ-based algorithm, and FRIRMR achieves 16.8\% over RIR algorithm.
",
)

@InProceedings(zidan:2014:leak,
  author    = "Mohammed Affan Zidan and Ahmed S. Salem 
           and Hossam A.~H.~Fahmy and Khaled Nabil Salama",
  title     = "Leakage analysis of crossbar memristor arrays",
  booktitle = "14th {I}nternational Workshop on Cellular Nanoscale Networks 
and their Applications ({CNNA}), {N}otre {D}ame, {I}ndiana, {USA}",
  year      = "2014",
  month     = jul,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "",
  note      = "DOI:10.1109/CNNA.2014.6888635",
  abstract  = "
Crossbar memristor arrays provide a promising high density alternative 
for the current memory and storage technologies. These arrays suffer 
from parasitic current components that significantly increase the power 
consumption, and could ruin the readout operation. In this work we study 
the trade-off between the crossbar array density and the power 
consumption required for its readout. Our analysis is based on 
simulating full memristor arrays on a SPICE platform.
",
)


@InProceedings(emara:2014:model,
  author    = "Ahmed Adel Emara and Mohamed Aboudina and Hossam A. H. Fahmy",
  title     = "Corrected and Accurate Verilog-{A} for Linear Dopant Drift 
Model of memristor",
  booktitle = "Fifty-Seventh {M}id{W}est {S}ymposium on {C}ircuits {A}nd 
{S}ystems, {(MWSCAS)}, {C}ollege {S}tation, {T}exas, 
{USA}",
  year      = "2014",
  month     = aug,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "",
  abstract  = "
There is an urgent need to develop accurate memristor circuit models for 
use in future large designs. Several Verilog-A and Spice models have 
been presented which vary in their accuracy and simulation speed. This 
paper corrects a previous Verilog-A model and enhances the accuracy of 
another model. The results show that our proposal is stable over long 
simulation time, correctly predicts the behavior of circuits, provides a 
better accuracy, and is as fast as previous models. These results make 
our model the best choice for large memory or logic circuits designs 
using memristors.
",
)


@InProceedings(amr:2014:tblmis,
author = "Amr Elhelw and Ali A. El-Moursy and Hossam A. H. Fahmy",
title  = "Time-Based Least Memory Intensive scheduling",
bookTitle= "The 8th {IEEE} International Symposium on Embedded 
            Multicore/Many-core Systems-on-Chip ({MCSoC}-14), 
            {A}izu-{W}akamatsu, {J}apan",
year   = "2014",
month  = sep,
pages   = "",
ISBN   = "",
ps     = "",
pdf    = "",
abstract = "
In the modern chip-multiprocessor system, main
memory is a shared resource among multiple concurrently executing 
threads/applications. The memory scheduling algorithms are
developed to resolve memory contention by arbitrating memory
access in such a way that competing threads progress at a
relatively fast and even pace, resulting in high system throughput
and fairness. This paper presents a new memory scheduling
algorithm called Time-Based Least Memory Intensive scheduling
(TB-LMI) that addresses system throughput and fairness with the
goal of achieving a better throughput and limiting the unfairness.
The main idea is to prioritize threads according to their memory
contentions every pre-defined period of cycles to improve system
throughput and to guarantee fairness.
We evaluate TB-LMI on a variety of multi-programmed
workloads with different queue sizes of memory controllers and
compare its performance to four previously proposed scheduling
algorithms. The TB-LMI achieves the best system throughput and
fairness. On 8-core system, TB-LMI improves system throughput
and fairness on average by 4.22\% and 11.7\% respectively
compared to Thread Cluster Memory scheduling (TCM) (previous
work providing the best system throughput and fairness). Our
newly proposed technique adds negligible area and logic overhead
to the Memory Controller compared to the benefits on the system
performance.
",
)


@InProceedings(amin:2014:model,
  author    = "Amin Maher and Hossam A. H. Fahmy",
  title     = "Using range arithmetic in evaluation of compact models",
  booktitle = "16th {GAMM}-{IMACS} {I}nternational {S}ymposium on
                  {S}cientific {C}omputing, {C}omputer {A}rithmetic
                  and {V}alidated {N}umerics ({SCAN}2014),
                  {W}{\"u}rzburg, {G}ermany",
  year      = "2014",
  month     = sep,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "",
  abstract  = "
Monte-Carlo simulations are usually used to analysis the impact of 
statistical parameters variations in electronic circuit design. As
alternative we evaluate the usage of different range arithmetic models.
A new simulation flow based on interval parameters is introduced to 
integrate or replace Monte-Carlo simulations. Design aspects for interval
simulator are discussed. Interval arithmetic, modal interval arithmetic
and affine arithmetic are used to evaluate semiconductor device models
and to do circuit simulation. Results for linear circuits and non-linear
models are obtained.
",
)


@InProceedings(amr:2014:noc,
  author    = "Amr Hany and Magdy A. El-Moursy and Hossam A. H. Fahmy",
  title     = "Network Of Cores For Large Systems",
  booktitle = "9th {IEEE} {I}nternational {C}onference on {C}omputer {E}ngineering and {S}ystems ({ICCES}), Cairo, Egypt",
  year      = "2014",
  month     = dec,
  volume    = "",
  pages     = "",
  ps        = "",
  pdf       = "",
  abstract  = "A comparison between SoC with shared bus medium
and NoC using Transaction Level Modeling (TLM) is presented.
The router of the NoC is implemented using SystemC. Different
traffic patterns and loads are used to evaluate the
implementation. Detailed performance evaluation using different
metrics such as throughput, latency, number of hops and power
consumption is provided. It is shown that the throughput of NoC
is higher in addition to its scalability as number of cores in large
systems increases. The rate of throughput increase in NoC is
higher than the rate of increase of power consumption as
compared to SoC as system size increases. SoC could not satisfy
the continuous demands of large systems while NoC is highly
scalable.
",
)


@InProceedings(muhammad:2014:noc,
author  = "Muhammad R. Soliman and Hossam A. H. Fahmy and Serag E.-D. Habib",
title = "{NoC}-based Many-Core Processor Using {CUSPARC} Architecture",
bookTitle= "26th {IEEE} {I}nternational {C}onference on {M}icroelectronics ({ICM}), Doha, Qatar",
year   = "2014",
month  = dec,
volume = "",
pages  = "",
ps     = "",
pdf    = "",
abstract="
This paper introduces CUSPARC-M, a many-core message-passing processor 
based on the Cairo University SPARC processor, CUSPARC, core. CUSPARC-M 
consists of 16 cores arranged in 4x4 mesh architecture. A 
Network-on-Chip (NoC) that incorporates X-Y routing, wormhole switching 
and dynamic virtual channels for flow control provides intra-chip 
communication. The design is synthesized using TSMC 65nm LP kit 
achieving power consumption of 13.68x and area of 17x compared to 
CUSPARC. The NoC consumes only 5.2\% of the total power. Simulating a 
16-block JPEG encoder on 12 cores of CUSPARC-M yielded up to 8.72x 
speedup factor relative to the single-core version.
",
)


@article(zidan:2015:compread,
author = "Mohammed Affan Zidan and Ahmed S. Salem and Hesham Omran 
          and Hossam A.~H.~Fahmy and Khaled Nabil Salama",
title  = "Compensated Readout for High Density {MOS}-Gated Memristor 
          Crossbar Array",
journal= "{IEEE} {T}ransactions on {N}anotechnology ({TNANO})",
personalNote= "{IEEE} {T}ransactions on {N}anotechnology ({TNANO}): Brief Papers",
year   = "2015",
month  = jan,
volume = "14",
number = "1",
pages  = "3--6",
ISSN   = "1536-125X",
ps     = "",
pdf    = "",
note   = "DOI: 10.1109/TNANO.2014.2363352",
url    = "http://dx.doi.org/10.1109/TNANO.2014.2363352",
abstract ="Leakage current is one of the main challenges facing 
high-density MOS-gated memristor arrays. In this study, we show that 
leakage current ruins the memory readout process for high-density 
arrays, and analyze the tradeoff between the array density and its power 
consumption. We propose a novel readout technique and its underlying 
circuitry, which is able to compensate for the transistor 
leakage-current effect in the high-density gated memristor array.
",
)


@InProceedings(mohie:2015:finfet,
  author    = {Mohamed Mohie El-Din and Hassan Mostafa and Hossam A. H. Fahmy
               and Yehea Ismail and H. Abdelhamid},
  title     = {Performance evaluation of {F}in{FET}-based {FPGA} cluster
               under threshold voltage variation},
  booktitle = {The 13th {IEEE} International New Circuits and Systems Conference
               ({NEWCAS}), {G}renoble, {F}rance},
  year      = {2015},
  month     = jun,
  pages     = {1--4},
  isbn      = {},
  ps        = {},
  pdf       = {},
  note      = {DOI: 10.1109/NEWCAS.2015.7182006},
  url       = {http://dx.doi.org/10.1109/NEWCAS.2015.7182006},
  abstract  = {
The performance of FinFET-based FPGA cluster is evaluated with 
technology scaling for channel length from 20nm down to 7nm showing the 
scaling trends of basic performance metrics. The impact of threshold 
voltage variation, considering die-to-die variations, on the delay, 
power, and power-delay product is reported after the simulation of a 
2-bit adder benchmark. Simulation results show an increasing trend of 
the average power and power-delay product variations with threshold 
voltage as we go down with technology node. On the contrary, the delay 
is showing the least percentage of variations with threshold voltage at 
the most advanced node of 7nm.
},
)

@InProceedings(ahmed:2015:rtmc,
  author    = {Ahmed Shafik Mohamed and Ali A. El-Moursy and Hossam A. H. Fahmy},
  title     = {Real-Time Memory Controller for Embedded  Multi-core System},
  booktitle = {The 17th {IEEE} International Conference on High Performance
               Computing and Communications, {N}ew {Y}ork, {USA}},
  year      = {2015},
  month     = aug,
  pages     = {},
  isbn      = {},
  ps        = {},
  pdf       = {},
  abstract  = {
Nowadays modern chip multi-cores (CMPs) become
more demanding because of their high performance especially
in real-time embedded systems. On the other side, bounded
latencies has become vital to guarantee high performance and
fairness for applications running on CMPs cores. We propose a
new memory controller that prioritizes and assigns defined
quotas for cores within unified epoch (MCES). Our approach
works on variety of generations of double data rate DRAM
(DDR DRAM). MCES is able to achieve an overall
performance reached 35\% for 4 cores system.
},
)


@InProceedings(amr:2015:tblmis,
  author    = {Amr Elhelw and Ali A. El-Moursy and Hossam A. H. Fahmy},
  title     = {Adaptive Time-Based Least Memory Intensive scheduling},
  booktitle = {The 9th {IEEE} International Symposium on Embedded
               Multicore/Many-core Systems-on-Chip ({MCSoC}-15), {T}urin, {I}taly},
  year      = {2015},
  month     = sep,
  pages     = {},
  isbn      = {},
  ps        = {},
  pdf       = {},
  abstract  = {
DRAM memory is a major resource shared in
multi-core system, hence memory requests from different 
applications interfere with each other. Therefore, different 
applications running together on the same chip can experience
extremely different memory system performance: one application
can experience a severe slowdown or starvation while another is
unfairly prioritized by the memory scheduler. Existing memory
access scheduling techniques try to optimize the overall multi-core
system performance and fairness.
This paper proposes an effective memory access scheduler,
called Adaptive Time-Based Least Memory Intensive scheduling
(Adaptive TB-LMI). The goal of the proposed scheduler is to
increase the overall system performance and fairness. Adaptive
TB-LMI showed an average increase in performance and fairness
by 2.5\% and 10.2\% respectively compared to Time-Based Least
Memory Intensive scheduling (TB-LMI) (previous work providing 
the best system throughput and fairness). Adaptive TB-LMI
showed a maximum increase in performance and fairness by
9.65\% and 22.16\% respectively compared to TB-LMI. Adaptive
TB-LMI decreases the hardware area required by 30.8\% 
compared to TB-LMI.
},
)


@article(wafaa:2015:pnasglm,
  author       = {Wafaa S. Sayed and Ahmed G. Radwan and Hossam A. H. Fahmy},
  title        = {Design of Positive, Negative, and Alternating Sign
                  Generalized Logistic Maps},
  journal      = {Discrete Dynamics in Nature and Society},
  personalnote = {Received 1 April 2015; Accepted 2 June 2015},
  year         = {2015},
  month        = {},
  volume       = {2015},
  number       = {},
  pages        = {},
  issn         = {},
  ps           = {},
  pdf          = {},
  note         = {Article ID 586783, DOI: 10.1155/2015/586783},
  url          = {http://dx.doi.org/10.1155/2015/586783},
  abstract     = {
The discrete logistic map is one of the most famous discrete chaotic 
maps that has widely-spread applications. This paper investigates a set 
of four generalized logistic maps where the conventional map is a 
special case. The proposed maps have extra degrees of freedom which 
provide different chaotic characteristics and increase the design 
flexibility required for many applications such as quantitative 
financial modeling. Based on the maximum chaotic range of the output, 
the proposed maps can be classified as: positive logistic map, mostly 
positive logistic map, negative logistic map, and mostly negative 
logistic map. Mathematical analysis for each generalized map includes: 
bifurcation diagrams relative to all parameters, effective range of 
parameters, first bifurcation point, as well as the maximum Lyapunov 
exponent (MLE). Independent, vertical, and horizontal scales of the 
bifurcation diagram are discussed for each generalized map as well as a 
new bifurcation diagram related to one of the added parameters. A 
systematic procedure to design two-constraints logistic map is discussed 
and validated through four different examples.
},
)


@InProceedings(wafaa:2015:nolta,
author = "Wafaa S. Sayed and Ahmed G. Radwan and Hossam A. H. Fahmy 
          and Abdel-Latif E. Hussein",
title  = "Scaling Parameters and Chaos in Generalized 1D Discrete Time Maps",
bookTitle= "The 2015 International Symposium on Nonlinear Theory 
            and its Applications ({NOLTA2015}), {H}ong {K}ong, {C}hina",
year   = "2015",
month  = dec,
pages   = "",
ISBN   = "",
ps     = "",
pdf    = "",
abstract = "
Among all chaotic generators, 1D discrete maps are characterized by 
their simplicity and suitability for digital implementation, in addition 
to their widely spread applications. Generalizations on 1D discrete maps 
enhance their unpredictability and increase their reliability in secure 
communication and encryption. In this paper, three parameterized maps 
are discussed: scaled positive logistic map (SPLM), scaled mostly 
positive logistic map (SMPLM), and scaled tent map (STM). The impacts of 
the introduced scaling parameters on the properties of each map are 
discussed including: the bifurcation diagram versus the main system 
parameter, the main keypoints, the maximum chaotic range, and 
calculation of maximum Lyapunov exponent (MLE) versus all system 
parameters.
",
)


@InProceedings(nouh:2015:verify,
  author    = {Khaled Nouh and Hossam A. H. Fahmy},
  title     = {Binary Floating Point Verification Using Random
               Test Vector Generation Based on {SV} Constraints},
  booktitle = {The {IEEE} International Conference on Electronics,
               Circuits, and Systems, ({ICECS}), {C}airo, {E}gypt},
  year      = {2015},
  month     = dec,
  pages     = {433--436},
  isbn      = {},
  ps        = {},
  pdf       = {},
  note      = {DOI: 10.1109/ICECS.2015.7440341},
  url       = {http://dx.doi.org/10.1109/ICECS.2015.7440341},
  abstract  = {
Verification of Binary Floating Point (FP) Arithmetic
requires robust techniques to prove compliance with Floating
point IEEE Standard (IEEE Std 754-2008). This paper provides
a new verification methodology that uses a constraint based
random technique to generate test vectors for validating FP
arithmetic instructions. The new proposal is generic and can be
used to verify any software or hardware binary FP design. The
constraints used in verification are written in System Verilog
(SV) language and can be solved with any SV constraint solver
tool. The paper provides a case study to prove the feasibility and
usefulness of the proposed approach in finding bugs for
Addition-Subtraction and Multiplication operations.
},
)


@InProceedings(shafiey:2015:fft,
  author    = {Ahmed M. El-Shafiey and Mohamed E. Farag and
               Mohammed A. El-Motaz and Omar A. Nasr and
               Hossam A. H. Fahmy},
  title     = {Two-Stage Optimization of {CORDIC}-Friendly {FFT}},
  booktitle = {The {IEEE} International Conference on Electronics,
               Circuits, and Systems, ({ICECS}), {C}airo, {E}gypt},
  year      = {2015},
  month     = dec,
  pages     = {408--411},
  isbn      = {},
  ps        = {},
  pdf       = {},
  note      = {DOI: 10.1109/ICECS.2015.7440335},
  url       = {http://dx.doi.org/10.1109/ICECS.2015.7440335},
  abstract  = {
In this paper, authors extend the work of CORDIC-Friendly 
Fast Fourier Transform (FFT) architecture in [1].
Instead of optimizing each stage independently, a joint 
optimization of two stages of the CORDIC-Friendly FFT rotations is
considered. At no additional hardware cost, the proposed scheme
achieves up to 38 dB SQNR gain using two-iteration MSR-CORDIC 
when compared to the previous algorithm for different
FFT lengths.
},
)

@InProceedings(motaz:2015:fft,
author = "Mohammed A. El-Motaz and Ahmed M. El-Shafiey 
          and Mohamed E. Farag and Omar A. Nasr 
          and Hossam A. H. Fahmy",
title  = "Speeding-up fast {F}ourier transform",
bookTitle= "The {IEEE} International Conference on Electronics, 
            Circuits, and Systems, ({ICECS}), {C}airo, {E}gypt",
year   = "2015",
month  = dec,
pages   = "510--511",
ISBN   = "",
ps     = "",
pdf    = "",
note   = "DOI: 10.1109/ICECS.2015.7440365",
url    = "http://dx.doi.org/10.1109/ICECS.2015.7440365",
abstract = "
This work proposes a restructure of FFT algorithm to be more hardware 
friendly. The proposed algorithm is modeled as a combinatorial 
optimization problem. This paper presents two sub-optimal schemes of the 
proposed FFT restructure: one-stage and two-stage optimization. The 
proposed FFT algorithm is applied on 1024-point Radix-2 Single-Path 
Delay Feedback (R2SDF) architecture. The one-stage and two-stage 
optimization schemes achieve reduction in the multipliers area by 40.8\% 
and 62.5\%, respectively, compared with the conventional algorithm.
",
)


@InProceedings(amr:2015:icecs,
  author    = {Amr Hassan and Ramy Ahmed and Hassan Mostafa
               and Hossam A. H. Fahmy and Ahmed Hussien},
  title     = {Performance evaluation of dynamic partial reconfiguration
               techniques for software defined radio implementation
               on {FPGA}},
  booktitle = {The {IEEE} International Conference on Electronics,
               Circuits, and Systems, ({ICECS}), {C}airo, {E}gypt},
  year      = {2015},
  month     = dec,
  pages     = {183--186},
  isbn      = {},
  ps        = {},
  pdf       = {},
  note      = {DOI: 10.1109/ICECS.2015.7440279},
  url       = {http://dx.doi.org/10.1109/ICECS.2015.7440279},
  abstract  = {
Reconfigurability of SRAM-based Field Programmable Gate Arrays (FPGAs) 
is the most powerful feature over ASIC designs. Dynamic Partial 
Reconfiguration (DPR) emphasizes this feature by adding more flexibility 
over runtime phase. Xilinx Virtex family of FPGAs provides four 
techniques to perform DPR; SelectMAP, Serial mode, JTAG, and ICAP. In 
this paper, each of these techniques is reviewed, evaluated, and tested 
using Convolutional encoder, an essential block from Software Defined 
Radio (SDR) system, which becomes the most promising application for 
DPR. Experiments are carried out using Xilinx Virtex 5 kit 
``XUPV5-LX110T'' to measure the trade-offs between performance and 
area-overhead by adding reconfiguration controller on/off FPGA fabric. 
It is shown that the performance of each interface is independent of 
design resource, but proportional only with partial reconfiguration 
region selection that had been chosen at design place and route phase.
},
)

% NOTE(review): the pages value below (433--436) is identical to that of
% nouh:2015:verify in the same proceedings; one of the two is presumably a
% copy-paste error -- confirm against the publisher record for the DOI.
@InProceedings(wafaa:2015:icecs,
author = "Wafaa S. Sayed and Abdel-Latif E. Hussien 
          and Hossam A. H. Fahmy and Ahmed G. Radwan",
title  = "Generalized chaotic maps and elementary functions 
          between analysis and implementation",
bookTitle= "The {IEEE} International Conference on Electronics, 
            Circuits, and Systems, ({ICECS}), {C}airo, {E}gypt",
year   = "2015",
month  = dec,
pages   = "433--436",
ISBN   = "",
ps     = "",
pdf    = "",
note   = "DOI: 10.1109/ICECS.2015.7440363",
url    = "http://dx.doi.org/10.1109/ICECS.2015.7440363",
abstract = "
Nonlinear analysis and chaos have many applications in communications, 
cryptography, and many other fields. In this work, we aim to bridge the 
gap between mathematical analysis of generalized 1D discrete chaotic 
maps and their implementation on digital platforms. We propose several 
variations and generalizations on the logistic and tent maps and employ 
the power function $z = x^y$ in a general map that could yield each of them 
and other new maps. Finite precision logistic map is studied explaining 
the impact of finitude on its properties. In addition, floating-point 
implementations of the power function are tested on the occurrence of 
special values of the operands.
",
)


@InProceedings(wafaa:2015:icenco,
  author    = {Wafaa S. Sayed and Ahmed G. Radwan and Hossam A. H. Fahmy},
  title     = {Design of a Generalized Bidirectional Tent Map Suitable
               for Encryption Applications},
  booktitle = {The 11th International Computer Engineering
               Conference  ({ICENCO2015}), {C}airo, {E}gypt},
  year      = {2015},
  month     = dec,
  pages     = {},
  isbn      = {},
  ps        = {},
  pdf       = {},
  abstract  = {
},
)


@article(zidan:2016:singleread,
author = "Mohammed Affan Zidan and Hesham Omran and Rawan Naous 
          and Ahmed Sultan and Hossam A.~H.~Fahmy 
          and Wei D. Lu and Khaled Nabil Salama",
title  = "Single-Readout High-Density Memristor Crossbar",
journal= "Scientific Reports", 
personalnote="Received 30 May 2015; 
              Accepted 27 November 2015;
              Published online 07 January 2016",
year   = "2016",
month  = jan,
volume = "6",
number = "",
pages  = "",
ISSN   = "2045-2322",
ps     = "",
pdf    = "",
note   = "Article ID 18863, DOI: 10.1038/srep18863",
url    = "http://dx.doi.org/10.1038/srep18863",
abstract ="
High-density memristor-crossbar architecture is a very promising 
technology for future computing systems. The simplicity of the 
gateless-crossbar structure is both its principal advantage and the 
source of undesired sneak-paths of current. This parasitic current could 
consume an enormous amount of energy and ruin the readout process. We 
introduce new adaptive-threshold readout techniques that utilize the 
locality and hierarchy properties of the computer-memory system to 
address the sneak-paths problem. The proposed methods require a single 
memory access per pixel for an array readout. Besides, the memristive 
crossbar consumes an order of magnitude less power than state-of-the-art 
readout techniques.
",
)


@InProceedings(noha:2016:noc,
author = "Noha Gamal and Hossam A. H. Fahmy 
          and Yehea Ismail and Hassan Mostafa",
title  = "Design Guidelines for Embedded {NoCs} on {FPGAs}",
bookTitle= "The 17th {IEEE} International Symposium on Quality 
           Electronic Design ({ISQED}), {S}anta {C}lara, {CA}, {USA}",
year   = "2016",
month  = mar,
pages   = "",
ISBN   = "",
ps     = "",
pdf    = "",
personalnote   = "DOI: ",
personalurl    = "http://dx.doi.org/",
abstract = "
Including Networks-on-Chip (NoCs) within FPGAs has become necessary to 
overcome the problems of point-to-point interconnect scheme. This will 
enable interfacing with high speed IOs and partial dynamic 
reconfiguration (PDR), and reduce compile time and improve system 
performance. We compared FPGA-specific NoC components on soft and hard 
implementations and analyzed the efficiency gap between the two 
technologies to get design constraints in this space. Input module that 
includes memory buffers, implemented using block RAMs (BRAMs), has less 
1.8x area, 2.9x delay and 5.3x power. Switch has the largest gap: 90x 
area, 7x delay and 53x power. If the router is totally hard implemented, 
this will save 9x area, 3.7x delay and 12x power. By comparing our 
results with same flow on ASIC-specific router, we show that using 
FPGA-specific NoCs design improves utility with 3x in area with slight 
increase in delay.
",
)

@article(amr:2016:filament,
  author       = {Amr M.~Hassan and Hossam A.~H.~Fahmy and Nadia H.~Rafat},
  title        = {Enhanced Model of Conductive Filament-Based Memristor via
                  including Trapezoidal Electron Tunneling Barrier Effect},
  journal      = {{IEEE} {T}ransactions on {N}anotechnology ({TNANO})},
  personalnote = {Accepted 17 March 2016},
  year         = {2016},
  month        = may,
  volume       = {15},
  number       = {3},
  pages        = {484--491},
  issn         = {1536-125X},
  ps           = {},
  pdf          = {},
  note         = {DOI: 10.1109/TNANO.2016.2546686},
  url          = {http://dx.doi.org/10.1109/TNANO.2016.2546686},
  abstract     = {
Memristors exhibit very promising features such as
nonvolatility and small area. Several types of memristors have
been developed in the last decade using different materials along
with physical models explaining their behaviors. In this paper, we
modify a previously published model to account for a trapezoidal
electron tunneling barrier rather than a zero field or constant
potential barrier. The model is verified against experimental
data showing better agreement. We then perform a study to
find out the effect of different memristors parameters on its
I-V characteristics and how to shape the characteristics to fit
the applications. Finally, we provide a SPICE model which
takes into account the tunneling capacitance and clarify that
any fabricated memristor has, inherently, a memcapacitor in
parallel. The dominant element may be the memristor or the
memcapacitor depending on the frequency of operation.
},
)


@article(wafaa:2016:pow,
  author       = {Wafaa S. Sayed and Hossam A. H. Fahmy},
  title        = {What are the Correct Results for the Special Values of the
                  Operands of the Power Operation?},
  journal      = {{ACM} Transactions on Mathematical Software},
  personalnote = {Received 03 April 2014;
                  Reviewed 30 July 2014;
                  Revised 09 October 2014;
                  Accepted 22 July 2015;
                  Published June 2016},
  year         = {2016},
  month        = may,
  volume       = {42},
  number       = {2},
  pages        = {14:1--14:17},
  issn         = {0098-3500},
  ps           = {},
  pdf          = {},
  note         = {DOI: 10.1145/2809783},
  url          = {http://dx.doi.org/10.1145/2809783},
  issue_date   = {June 2016},
  articleno    = {14},
  numpages     = {17},
  acmid        = {2809783},
  publisher    = {ACM},
  address      = {New York, NY, USA},
  keywords     = {Floating-point arithmetic, NaN, incompatibility,
                  inconsistency, indeterminate, l'H\^{o}pital's rule, limits},
  abstract     = {
Language standards such as C99, C11, as well as the IEEE Standard for 
Floating-Point Arithmetic 754 (IEEE Std 754-2008) specify the expected 
behavior of binary and decimal floating-point arithmetic in computer 
programming environments and the handling of special values and 
exception conditions. Many researchers focus on verifying the 
compliance of implementations for binary and decimal floating-point 
operations with these standards. In this article, we are concerned 
with the special values of the operands of the power function $Z = X^Y$. 
We study how the standards define the correct results for this 
operation, propose a mathematically justified definition for the correct 
results of the power function on the occurrence of these special values 
as its operands, test how different software implementations for the 
power function deal with these special values, and classify the behavior 
of different programming languages from the viewpoint of how much they 
conform to the standards and our proposed mathematical definition. We 
present inconsistencies between the implementations and the standards 
and we discuss incompatibilities between different versions of the same 
software.
},
)


@InProceedings(radwan:2016:actea,
  author    = {Ahmed G. Radwan and Wafaa S. Sayed and Hossam A. H. Fahmy},
  title     = {Double-Sided Bifurcations in Tent maps: Analysis and Applications},
  booktitle = {The 3rd {I}nternational {C}onference on {A}dvances in
               {C}omputational {T}ools for {E}ngineering {A}pplications
               ({ACTEA}), {L}ebanon},
  year      = {2016},
  month     = jul,
  pages     = {},
  isbn      = {},
  ps        = {},
  pdf       = {},
  abstract  = {
},
)

@article(nahla:2016:memristor,
author = "Nahla Elashkar and
          Mohamed Aboudina and
          Hossam A. H. Fahmy and
          Ghada Hamdy Ibrahim and 
          Ahmed Hussien Khalil",
title  = "Memristor based {BPSK} and {QPSK} Demodulators with Nonlinear 
          Dopant Drift Model",
journal= "Microelectronics Journal", 
year   = "2016",
month  = oct,
volume = "56",
number = "",
pages  = "17--24",
ISSN   = "0026-2692",
ps     = "",
pdf    = "",
personalnote   = "Received 24 April 2016, 
                  Revised 23 July 2016, 
                  Accepted 27 July 2016, 
                  Available online 10 August 2016",
note   = "DOI: 10.1016/j.mejo.2016.07.015",
url    = "http://dx.doi.org/10.1016/j.mejo.2016.07.015",
abstract ="
In this paper, the dependence of the instantaneous memristance value and 
its I--V characteristics on a periodic signal phase are studied. Hence, 
expression for the instantaneous memristance as a function of the 
periodic input phase is derived. This derivation is based on the 
memristor linear dopant drift model and is provided for sinusoidal input 
waveforms. To prove the tendency, simulations using linear and nonlinear 
dopant drift memristor models are performed in the Cadence simulation 
environment. Based on those, a set of digital communication demodulators 
are proposed and investigated exploiting the change of the average 
memristance with the initial phase of applied signal. The 
experimental-based `nonlinear' dopant drift model is used in designing 
the proposed demodulators for Binary Phase Shift Keying (BPSK) and 
Quadrature Phase Shift Keying (QPSK) modulation schemes. Since all 
proposed demodulators are asynchronous, the proposed circuits do not 
need any carrier recovery circuits. Moreover, transient simulations have 
been executed showing the proper matching to the expected performance.
",
)


@InProceedings(mohie:2016:finfet,
  author       = {Mohamed Mohie El-Din
                  and Hossam A. H. Fahmy
                  and Yehea Ismail
                  and Noha Gamal
                  and Hassan Mostafa
                 },
  title        = {Leakage Power Evaluation of {F}in{FET}-Based {FPGA} Cluster
                  Under Threshold Voltage Variation},
  booktitle    = { The 11th International Design and Test Symposium,
                  {H}ammamet, {T}unisia},
  year         = {2016},
  month        = dec,
  pages        = {},
  isbn         = {},
  ps           = {},
  pdf          = {},
  personalnote = {DOI: },
  personalurl  = {http://dx.doi.org/},
  abstract     = {
},
)

@InProceedings(noha:2016:snoc,
  author       = {Noha Gamal and Hassan Mostafa and Hossam A. H. Fahmy
                  and Yehea Ismail and Mohamed Mohie El-Din},
  title        = {Design Guidelines for Soft Implementations to
                  Embedded {NoCs} of {FPGAs}},
  booktitle    = { The 11th International Design and Test Symposium,
                  {H}ammamet, {T}unisia},
  year         = {2016},
  month        = dec,
  pages        = {},
  isbn         = {},
  ps           = {},
  pdf          = {},
  personalnote = {DOI: },
  personalurl  = {http://dx.doi.org/},
  abstract     = {
},
)


@article(wafaa:2017:transition,
author = "Wafaa S. Sayed and Hossam A. H. Fahmy 
          and Ahmed A. Rezk and Ahmed G. Radwan",
title  = "Generalized Smooth Transition Map Between Tent and Logistic Maps",
journal= "International Journal of Bifurcation and Chaos", 
personalnote="Received February 21, 2016; 
              Revised August 2, 2016;
              Accepted 02 November 2016;",
year   = "2017",
month  = jan,
volume = "27",
number = "01",
pages  = "1730004",
ISSN   = "0218-1274",
ps     = "",
pdf    = "http://www.worldscientific.com/doi/pdf/10.1142/S021812741730004X",
note   = "DOI: 10.1142/S021812741730004X",
url    = "http://dx.doi.org/10.1142/S021812741730004X",
abstract ="
There is a continuous demand on novel chaotic generators to be employed 
in various modeling and pseudo-random number generation applications. 
This paper proposes a new chaotic map which is a general form for 
one-dimensional discrete-time maps employing the power function with the 
tent and logistic maps as special cases. The proposed map uses extra 
parameters to provide responses that fit multiple applications for which 
conventional maps were not enough. The proposed generalization covers 
also maps whose iterative relations are not based on polynomials, 
i.e., with fractional powers. We introduce a framework for analyzing the 
proposed map mathematically and predicting its behavior for various 
combinations of its parameters. In addition, we present and explain 
the transition map which results in intermediate responses as the 
parameters vary from their values corresponding to tent map to those 
corresponding to logistic map case. We study the properties of the 
proposed map including graph of the map equation, general bifurcation 
diagram and its key-points, output sequences, and maximum Lyapunov 
exponent. We present further explorations such as effects of scaling, 
system response with respect to the new parameters, and operating ranges 
other than transition region. Finally, a stream cipher system based on 
the generalized transition map validates its utility for image 
encryption applications. The system allows the construction of more 
efficient encryption keys which enhances its sensitivity and other 
cryptographic properties.
",
)


@article(wafaa:2017:precision,
author = "Wafaa S. Sayed and Ahmed G. Radwan
          and Ahmed A. Rezk and Hossam A. H. Fahmy",
title  = "Finite Precision Logistic Map Between Computational Efficiency
          and Accuracy with Encryption Applications",
journal= "Complexity", 
personalnote="Received 30 July 2016; 
              Revised 8 October 2016; 
              Accepted 04 December 2016;
              Published 12 February 2017",
year   = "2017",
month  = "",
volume = "2017",
number = "",
pages  = "",
ISSN   = "1076-2787 (print); 1099-0526 (web)",
ps     = "",
pdf    = "http://downloads.hindawi.com/journals/complexity/2017/8692046.pdf",
note   = "Article ID 8692046, DOI: 10.1155/2017/8692046",
url    = "http://dx.doi.org/10.1155/2017/8692046",
abstract ="
Chaotic systems appear in many applications such as pseudo-random number 
generation, text encryption and secure image transfer. Numerical 
solutions of these systems using digital software or hardware inevitably 
deviate from the expected analytical solutions. Chaotic orbits produced 
using finite precision systems do not exhibit the infinite period 
expected under the assumptions of infinite simulation time and 
precision. In this paper, digital implementation of the generalized 
logistic map with signed parameter is considered. We present a 
fixed-point hardware realization of a Pseudo-Random Number Generator 
using the logistic map that experiences a tradeoff between computational 
efficiency and accuracy. Several introduced factors such as the used 
precision, the order of execution of the operations, parameter and 
initial point values affect the properties of the finite precision map. 
For positive and negative parameter cases, the studied properties 
include bifurcation points, output range, maximum Lyapunov Exponent, and 
period length. The performance of the finite precision logistic map is 
compared in the two cases. A basic stream cipher system is realized to 
evaluate the system performance for encryption applications for 
different bus sizes regarding the encryption key size, hardware 
requirements, maximum clock frequency, NIST and correlation, histogram, 
entropy and Mean Absolute Error analyses of encrypted images.
",
)


@article(wahba:2017:fma,
  author       = {Ahmed A.~Wahba and Hossam A.~H.~Fahmy},
  title        = {Area Efficient and Fast Combined Binary/Decimal Floating
                  Point Fused Multiply Add Unit},
  journal      = {{IEEE} {T}ransactions on {C}omputers},
  personalnote = {Accepted 13 June 2016,
                  Available online 22 June 2016,
                  Published 01 February 2017},
  year         = {2017},
  month        = feb,
  volume       = {66},
  number       = {2},
  pages        = {226--239},
  issn         = {0018-9340},
  ps           = {},
  pdf          = {},
  note         = {DOI: 10.1109/TC.2016.2584067},
  url          = {http://dx.doi.org/10.1109/TC.2016.2584067},
  abstract     = {
In this work we present a new 64-bit floating point Fused Multiply
Add (FMA) unit that can perform both binary and decimal addition,
multiplication, and fused-multiply-add operations. The presented FMA
has 6\% less delay than the fastest stand-alone decimal unit and
23\% less area than both binary and decimal units together. These
results were achieved by the use of: 1) column by column reduction
to reduce the partial products in the multiplier tree, 2) a new leading
zeros detector that produces its output in base-3 to simplify the
normalization shifting in the binary datapath, 3) the use of a redundant
adder to perform the final addition, 4) using a new rounding-while-redundant
technique to hide the rounding delay and remove it from the critical
path, and 5) using a new simple conversion technique from redundant
to binary/decimal.
},
)

@InProceedings(hazem:2017:gpu,
  author       = {Hazem Abdelhafez and Mohamed Rehan and Hossam A.~H.~Fahmy},
  title        = {Efficient {GPU} Utilization in Heterogeneous Big Data Cluster
                  Using Token-Based Scheduler},
  booktitle    = {The 30th {IEEE} Canadian Conference on Electrical and
                  Computer Engineering ({CCECE}), {W}indsor, {ON}, {C}anada},
  year         = {2017},
  month        = apr,
  pages        = {},
  isbn         = {},
  ps           = {},
  pdf          = {},
  personalnote = {DOI: },
  personalurl  = {http://dx.doi.org/},
  abstract     = {
},
)


@article(emara:2017:memcap,
  author          = "Ahmed Adel M.~Emara and Mohamed M.~Aboudina and
                     Hossam A.~H.~Fahmy",
  title           = "Non-volatile low-power crossbar memcapacitor-based memory",
  journal         = "Microelectronics Journal",
  year            = "2017",
  month           = jun,
  volume          = "64",
  number          = "",
  pages           = "39--44",
  ISSN            = "0026-2692",
  note            = "DOI: 10.1016/j.mejo.2017.04.005",
  url             = "http://dx.doi.org/10.1016/j.mejo.2017.04.005",
  ps              = "",
  pdf             = "",
  personalnoteone = "Submitted 14 November 2016,
                     Revised 11 March 2017,
                     Accepted 10 April 2017,
                     Available online 02 May 2017,
                     Published ?? ??? 2017",
  authortrack     = "http://authors.elsevier.com/TrackPaper.html?trk_article=MEJ4183&trk_surname=Aboudina",
  abstract        = "
This paper proposes the use of a memcapacitor as a new memory cell. This new
element may lead to a better memory on many directions: non-volatility,
speed, density, and power consumption. To the best of our knowledge, we
present the first Verilog-A model for memcapacitors and use it to simulate
the memory cell then complete crossbar arrays of various sizes. The reading
circuits completely solve the sneak paths problem including the effect of
coupling parasitics in the large arrays. Our analysis indicates that
memcapacitor memories are non-volatile memories with a density at least
equivalent to dynamic RAMs but with lower power consumption.
",
)


@article(mervat:2017:dbmul,
author  = "Mervat M.~A.~Mahmoud and Dalia A.~El-Dib and Hossam A.~H.~Fahmy",
title  = "Low Energy Pipelined Dual Base (Decimal/Binary) Multiplier",
journal= "Microelectronics Journal",
personalnoteone="Submitted 16 June 2016,
                 Revised 17 April 2017,
                 Accepted 11 May 2017,
                 Available online 16 May 2017,
                 Published ?? ??? 2017",
year   = "2017",
month  = jul,
volume = "65",
number = "",
pages  = "11--20",
ISSN   = "0026-2692",
ps     = "",
pdf    = "",
note   = "DOI: 10.1016/j.mejo.2017.05.004",
url    = "http://dx.doi.org/10.1016/j.mejo.2017.05.004",
authortrack = "http://authors.elsevier.com/TrackPaper.html?trk_article=MEJ4194&trk_surname=Mahmoud",
abstract ="
Combined binary/decimal arithmetic is optimal in
supporting binary and decimal high speed and low power
applications. A low energy clock-gated pipelined dual base
binary/decimal fixed-point multiplier is suggested extending a
previously proposed non-pipelined design. A thorough study
conducted on both the pipelined and non-pipelined designs
versus other architectures in literature proves tremendous
reductions in power, energy and area consumption. The
non-pipelined multiplier design saves energy and area consumptions
by up to 41\% and 37\%, respectively, retaining almost the same
delay as the fastest known design in literature. It also allows
operating frequencies of up to 4 GHz for 15 nm technology.
Then, the pipeline stages are chosen to achieve further energy
reductions with acceptable latency. In addition, clock gating the
pipelined multiplier design is introduced to provide a total of
43\% energy reduction for the pipelined design if compared to the
non-pipelined design.
",
)


@article(abdelkader:2017:finfet,
author  = "Osama Abdelkader and Mohamed Mohie El-Din and Hassan Mostafa
           and Hamdy Abdelhamid and Hossam A.~H.~Fahmy and Yehea Ismail
           and Ahmed M.~Soliman",
title  = "Technology Scaling Roadmap for {F}in{FET}-Based {FPGA} Clusters
          Under Process Variations",
journal= "Journal of Circuits, Systems and Computers",
personalnoteone="Submitted 30 August 2016,
                 Accepted 27 June 2017,
                 Available online 28 July 2017,
                 Published ?? ??? 2017",
year   = "2017",
month  = jul,
volume = "",
number = "",
pages  = "1850056",
ISSN   = "0218-1266",
ps     = "",
pdf    = "",
note   = "DOI: 10.1142/S0218126618500561",
url    = "http://dx.doi.org/10.1142/S0218126618500561",
abstract ="
The technology scaling impact on FinFET-based Field-Programmable Gate 
Array ({FPGA}) components (Flip-Flops and Multiplexers) and cluster 
metrics is evaluated for technology nodes starting from 20nm down to 
7nm. Power consumption, delay and energy (Power Delay Product, or {PDP}) 
trends are reported with {FinFET} technology scaling. Cluster metrics 
are then evaluated based on three benchmarking circuits: 2-bit adder, 
4-bit {NAND} and cascaded flip-flops chain. The study shows that power, 
delay and {PDP} of the {FPGA} cluster are improved as we scale down the 
technology. An example for improvement is that for 7nm 2-bit adder, 
circuit speed is 15\% higher than its value at 20nm and PDP at 7nm is 
reduced by 43\% compared to its value at 20nm. The impacts of 
temperature and threshold voltage variations on {FPGA} cluster performance 
are also reported after evaluating a 2-bit adder circuit as a benchmark 
which is then used to calculate the design constraints to meet 99.9\% 
yield percentage.
",
)

@article(emara:2017:crossbar,
  author          = "Ahmed Adel M.~Emara and Mohamed M.~Aboudina and
                     Hossam A.~H.~Fahmy",
  title           = "Adaptive and optimum multiport readout of
                     non-gated crossbar memory arrays",
  journal         = "Microelectronics Journal",
  year            = "2017",
  month           = sep,
  volume          = "67",
  number          = "",
  pages           = "162--168",
  ISSN            = "0026-2692",
  note            = "DOI: 10.1016/j.mejo.2017.08.007",
  url             = "http://dx.doi.org/10.1016/j.mejo.2017.08.007",
  ps              = "",
  pdf             = "",
  personalnoteone = "Submitted 15 November 2016,
                     Revised(1) 19 April 2017,
                     Revised(2) 25 July 2017,
                     Accepted 10 August 2017,
                     Available online 22 August 2017,
                     Published ?? September 2017",
  authortrack     = "http://authors.elsevier.com/TrackPaper.html?trk_article=MEJ4245&trk_surname=Aboudina",
  abstract        = "
Non-gated crossbar memory arrays are becoming strong candidates to replace
the current gated arrays due to their much higher density. Sneak paths are
the main problem in gate-less arrays. In this paper, we analyze a three
reading multiport readout method for non-gated memory arrays, study its
limitation versus the OFF/ON impedance ratio of the memory cell as well as
the ratio between the 1s and 0s memory cells. We provide an accurate
threshold that increases the noise margins that takes these factors into
consideration and we also introduce an adaptive threshold that tracks the 1s
and 0s density variations over time. Finally, we propose a sensing circuit
that accumulates the three reading without the need for extra complicated
circuitry to evaluate the read memory cell value.
",
)